devices/pci/coiommu.rs

// Copyright 2022 The ChromiumOS Authors
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

//! This is the CoIOMMU backend implementation. CoIOMMU is a virtual device
//! which provides fine-grained pinning for VFIO pci-passthrough devices so
//! that the hypervisor doesn't need to pin the entire VM's memory, which
//! improves memory utilization. CoIOMMU doesn't provide intra-guest
//! protection, so it can only be used for TRUSTED passthrough devices.
//!
//! CoIOMMU is presented at KVM forum 2020:
//! <https://kvmforum2020.sched.com/event/eE2z/a-virtual-iommu-with-cooperative-dma-buffer-tracking-yu-zhang-intel>
//!
//! Also presented at usenix ATC20:
//! <https://www.usenix.org/conference/atc20/presentation/tian>

17use std::collections::VecDeque;
18use std::convert::TryInto;
19use std::default::Default;
20use std::fmt;
21use std::mem;
22use std::panic;
23use std::sync::atomic::fence;
24use std::sync::atomic::AtomicU32;
25use std::sync::atomic::Ordering;
26use std::sync::Arc;
27use std::thread;
28use std::time::Duration;
29
30use anyhow::bail;
31use anyhow::ensure;
32use anyhow::Context;
33use anyhow::Result;
34use base::error;
35use base::info;
36use base::AsRawDescriptor;
37use base::Event;
38use base::EventToken;
39use base::MemoryMapping;
40use base::MemoryMappingBuilder;
41use base::Protection;
42use base::RawDescriptor;
43use base::SafeDescriptor;
44use base::SharedMemory;
45use base::Timer;
46use base::TimerTrait;
47use base::Tube;
48use base::TubeError;
49use base::WaitContext;
50use base::WorkerThread;
51use hypervisor::Datamatch;
52use hypervisor::MemCacheType;
53use resources::Alloc;
54use resources::AllocOptions;
55use resources::SystemAllocator;
56use serde::Deserialize;
57use serde::Deserializer;
58use serde::Serialize;
59use serde_keyvalue::FromKeyValues;
60use sync::Mutex;
61use thiserror::Error as ThisError;
62use vm_control::api::VmMemoryClient;
63use vm_control::VmMemoryDestination;
64use vm_control::VmMemorySource;
65use vm_memory::GuestAddress;
66use vm_memory::GuestMemory;
67use zerocopy::FromBytes;
68use zerocopy::IntoBytes;
69
70use crate::pci::pci_configuration::PciBarConfiguration;
71use crate::pci::pci_configuration::PciBarPrefetchable;
72use crate::pci::pci_configuration::PciBarRegionType;
73use crate::pci::pci_configuration::PciClassCode;
74use crate::pci::pci_configuration::PciConfiguration;
75use crate::pci::pci_configuration::PciHeaderType;
76use crate::pci::pci_configuration::PciOtherSubclass;
77use crate::pci::pci_configuration::COMMAND_REG;
78use crate::pci::pci_configuration::COMMAND_REG_MEMORY_SPACE_MASK;
79use crate::pci::pci_device::BarRange;
80use crate::pci::pci_device::PciDevice;
81use crate::pci::pci_device::Result as PciResult;
82use crate::pci::PciAddress;
83use crate::pci::PciBarIndex;
84use crate::pci::PciDeviceError;
85use crate::vfio::VfioContainer;
86use crate::Suspendable;
87use crate::UnpinRequest;
88use crate::UnpinResponse;
89
90const PCI_VENDOR_ID_COIOMMU: u16 = 0x1234;
91const PCI_DEVICE_ID_COIOMMU: u16 = 0xabcd;
92const COIOMMU_CMD_DEACTIVATE: u64 = 0;
93const COIOMMU_CMD_ACTIVATE: u64 = 1;
94const COIOMMU_CMD_PARK_UNPIN: u64 = 2;
95const COIOMMU_CMD_UNPARK_UNPIN: u64 = 3;
96const COIOMMU_REVISION_ID: u8 = 0x10;
97const COIOMMU_MMIO_BAR: PciBarIndex = 0;
98const COIOMMU_MMIO_BAR_SIZE: u64 = 0x2000;
99const COIOMMU_NOTIFYMAP_BAR: PciBarIndex = 2;
100const COIOMMU_NOTIFYMAP_SIZE: usize = 0x2000;
101const COIOMMU_TOPOLOGYMAP_BAR: u8 = 4;
102const COIOMMU_TOPOLOGYMAP_SIZE: usize = 0x2000;
103const PAGE_SIZE_4K: u64 = 4096;
104const PAGE_SHIFT_4K: u64 = 12;
105const PIN_PAGES_IN_BATCH: u64 = 1 << 63;
106
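// Layout of a DTT leaf entry (u32): bit 31 is the PINNED flag and bit 30 is the
// ACCESSED flag; judging from the compare_exchange logic in unpin_page, the
// remaining low bits appear to hold the guest's DMA map count for the page.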
107const DTTE_PINNED_FLAG: u32 = 1 << 31;
108const DTTE_ACCESSED_FLAG: u32 = 1 << 30;
109const DTT_ENTRY_PRESENT: u64 = 1;
110const DTT_ENTRY_PFN_SHIFT: u64 = 12;
111
112#[derive(ThisError, Debug)]
113enum Error {
114    #[error("CoIommu failed to create shared memory")]
115    CreateSharedMemory,
116    #[error("Failed to get DTT entry")]
117    GetDTTEntry,
118}
119
// Default unpin interval is 60 seconds.
const UNPIN_DEFAULT_INTERVAL: Duration = Duration::from_secs(60);
const UNPIN_GEN_DEFAULT_THRES: u64 = 10;
123/// Holds the coiommu unpin policy
124#[derive(Debug, Copy, Clone, Default, Eq, PartialEq, Serialize, Deserialize)]
125#[serde(rename_all = "kebab-case")]
126pub enum CoIommuUnpinPolicy {
127    #[default]
128    Off,
129    Lru,
130}
131
132impl fmt::Display for CoIommuUnpinPolicy {
133    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
134        use self::CoIommuUnpinPolicy::*;
135
136        match self {
137            Off => write!(f, "off"),
138            Lru => write!(f, "lru"),
139        }
140    }
141}
142
143fn deserialize_unpin_interval<'de, D: Deserializer<'de>>(
144    deserializer: D,
145) -> Result<Duration, D::Error> {
146    let secs = u64::deserialize(deserializer)?;
147
148    Ok(Duration::from_secs(secs))
149}
150
151fn deserialize_unpin_limit<'de, D: Deserializer<'de>>(
152    deserializer: D,
153) -> Result<Option<u64>, D::Error> {
154    let limit = u64::deserialize(deserializer)?;
155
156    match limit {
157        0 => Err(serde::de::Error::custom(
158            "Please use non-zero unpin_limit value",
159        )),
160        limit => Ok(Some(limit)),
161    }
162}
163
164fn unpin_interval_default() -> Duration {
165    UNPIN_DEFAULT_INTERVAL
166}
167
168fn unpin_gen_threshold_default() -> u64 {
169    UNPIN_GEN_DEFAULT_THRES
170}
171
172/// Holds the parameters for a coiommu device
173#[derive(Debug, Copy, Clone, Eq, PartialEq, Serialize, Deserialize, FromKeyValues)]
174#[serde(deny_unknown_fields)]
175pub struct CoIommuParameters {
176    #[serde(default)]
177    pub unpin_policy: CoIommuUnpinPolicy,
178    #[serde(
179        deserialize_with = "deserialize_unpin_interval",
180        default = "unpin_interval_default"
181    )]
182    pub unpin_interval: Duration,
183    #[serde(deserialize_with = "deserialize_unpin_limit", default)]
184    pub unpin_limit: Option<u64>,
185    // Number of unpin intervals a pinned page must be busy for to be aged into the
186    // older, less frequently checked generation.
187    #[serde(default = "unpin_gen_threshold_default")]
188    pub unpin_gen_threshold: u64,
189}
190
191impl Default for CoIommuParameters {
192    fn default() -> Self {
193        Self {
194            unpin_policy: CoIommuUnpinPolicy::Off,
195            unpin_interval: UNPIN_DEFAULT_INTERVAL,
196            unpin_limit: None,
197            unpin_gen_threshold: UNPIN_GEN_DEFAULT_THRES,
198        }
199    }
200}
201
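// MMIO register block at the start of BAR0, matching read_mmio/write_mmio:
// offset 0x0 is dtt_root, 0x8 is cmd, and 0x10 is dtt_level. The per-vcpu
// notify registers follow immediately after this struct in the same BAR.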
202#[derive(Default, Debug, Copy, Clone)]
203struct CoIommuReg {
204    dtt_root: u64,
205    cmd: u64,
206    dtt_level: u64,
207}
208
209#[derive(Default, Debug, Copy, Clone, PartialEq, Eq)]
210struct PinnedPageInfo {
211    gfn: u64,
212    unpin_busy_cnt: u64,
213}
214
215impl PinnedPageInfo {
216    fn new(gfn: u64, unpin_busy_cnt: u64) -> Self {
217        PinnedPageInfo {
218            gfn,
219            unpin_busy_cnt,
220        }
221    }
222}
223
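// Whether the unpin worker is allowed to run. The guest parks the worker with
// COIOMMU_CMD_PARK_UNPIN and unparks it with COIOMMU_CMD_UNPARK_UNPIN; parking
// is reference counted via unpin_park_count (see write_mmio).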
224#[derive(PartialEq, Debug, Eq)]
225enum UnpinThreadState {
226    Unparked,
227    Parked,
228}
229
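// Pinned pages are tracked in two LRU generations: newly pinned pages start in
// new_gen_pinned_pages, and pages that repeatedly fail to unpin (busy for
// unpin_gen_threshold rounds) are aged into old_gen_pinned_pages, which is
// scanned less frequently (see lru_unpin_pages).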
230struct CoIommuPinState {
231    new_gen_pinned_pages: VecDeque<PinnedPageInfo>,
232    old_gen_pinned_pages: VecDeque<u64>,
233    unpin_thread_state: UnpinThreadState,
234    unpin_park_count: u64,
235}
236
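// Safety: the caller must ensure that `user_addr` is the host address of a valid
// mapping of at least `size` bytes of guest memory (e.g. obtained via GuestMemory)
// that remains valid while the VFIO DMA mapping exists.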
237unsafe fn vfio_map(
238    vfio_container: &Arc<Mutex<VfioContainer>>,
239    iova: u64,
240    size: u64,
241    user_addr: u64,
242) -> bool {
243    match vfio_container
244        .lock()
245        .vfio_dma_map(iova, size, user_addr, true)
246    {
247        Ok(_) => true,
248        Err(e) => {
249            if let Some(errno) = std::io::Error::last_os_error().raw_os_error() {
250                if errno == libc::EEXIST {
                    // Already mapped; treat it as pinned so the caller sets the PINNED flag.
252                    error!("CoIommu: iova 0x{:x} already pinned", iova);
253                    return true;
254                }
255            }
256            error!("CoIommu: failed to map iova 0x{:x}: {}", iova, e);
257            false
258        }
259    }
260}
261
262fn vfio_unmap(vfio_container: &Arc<Mutex<VfioContainer>>, iova: u64, size: u64) -> bool {
263    match vfio_container.lock().vfio_dma_unmap(iova, size) {
264        Ok(_) => true,
265        Err(e) => {
266            error!("CoIommu: failed to unmap iova 0x{:x}: {}", iova, e);
267            false
268        }
269    }
270}
271
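// Header of a batched pin request placed in guest memory: the requesting
// endpoint's BDF and the number of gfns, followed immediately by `nr_pages`
// u64 gfn values (see pin_pages_in_batch).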
272#[derive(Default, Debug, Copy, Clone, FromBytes, IntoBytes)]
273#[repr(C)]
274struct PinPageInfo {
275    bdf: u16,
276    pad: [u16; 3],
277    nr_pages: u64,
278}
279
280const COIOMMU_UPPER_LEVEL_STRIDE: u64 = 9;
281const COIOMMU_UPPER_LEVEL_MASK: u64 = (1 << COIOMMU_UPPER_LEVEL_STRIDE) - 1;
282const COIOMMU_PT_LEVEL_STRIDE: u64 = 10;
283const COIOMMU_PT_LEVEL_MASK: u64 = (1 << COIOMMU_PT_LEVEL_STRIDE) - 1;
284
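// Returns the index of the entry for `gfn` within its page-table page at the
// given level. The leaf level (1) uses the low 10 bits of the gfn; each upper
// level uses the next 9 bits. For example, level_to_offset(gfn, 2) is
// (gfn >> 10) & 0x1ff.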
285fn level_to_offset(gfn: u64, level: u64) -> Result<u64> {
286    if level == 1 {
287        return Ok(gfn & COIOMMU_PT_LEVEL_MASK);
288    }
289
290    if level == 0 {
291        bail!("Invalid level for gfn 0x{:x}", gfn);
292    }
293
294    let offset = COIOMMU_PT_LEVEL_STRIDE + (level - 2) * COIOMMU_UPPER_LEVEL_STRIDE;
295
296    Ok((gfn >> offset) & COIOMMU_UPPER_LEVEL_MASK)
297}
298
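// Cache of the most recently resolved DTT leaf entry: `ptr` is the host address
// of the entry for `gfn`. gfn_to_dtt_pte uses it to avoid re-walking the DTT
// when consecutive gfns fall in the same leaf page-table page.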
299struct DTTIter {
300    ptr: *const u8,
301    gfn: u64,
302}
303
304impl Default for DTTIter {
305    fn default() -> Self {
306        DTTIter {
307            ptr: std::ptr::null(),
308            gfn: 0,
309        }
310    }
311}
312
// Get the DMA Tracking Table (DTT) entry associated with the gfn.
//
// There are two ways to get the entry:
// #1. Walk the DMA Tracking Table (DTT) by the GFN to find the
// corresponding entry. The DTT is shared between frontend and
// backend. It is a page-table-like structure whose entries are
// indexed by GFN. The argument dtt_root represents the GPA of the
// root page and dtt_level represents the maximum page table level.
//
// #2. Calculate the entry address via the argument dtt_iter. dtt_iter
// stores an entry address and the associated gfn. If the target gfn is
// in the same page table page as the gfn in dtt_iter, then the target
// entry address can be calculated based on the entry address in
// dtt_iter.
//
// As the DTT entry is shared between frontend and backend, accesses
// to it must be atomic. So the returned value is converted to an
// AtomicU32 pointer.
331fn gfn_to_dtt_pte(
332    mem: &GuestMemory,
333    dtt_level: u64,
334    dtt_root: u64,
335    dtt_iter: &mut DTTIter,
336    gfn: u64,
337) -> Result<*const AtomicU32> {
338    let ptr = if dtt_iter.ptr.is_null()
339        || dtt_iter.gfn >> COIOMMU_PT_LEVEL_STRIDE != gfn >> COIOMMU_PT_LEVEL_STRIDE
340    {
341        // Slow path to walk the DTT to get the pte entry
342        let mut level = dtt_level;
343        let mut pt_gpa = dtt_root;
344        let dtt_nonleaf_entry_size = mem::size_of::<u64>() as u64;
345
346        while level != 1 {
347            let index = level_to_offset(gfn, level)? * dtt_nonleaf_entry_size;
348            let parent_pt = mem
349                .read_obj_from_addr::<u64>(GuestAddress(pt_gpa + index))
350                .context(Error::GetDTTEntry)?;
351
352            if (parent_pt & DTT_ENTRY_PRESENT) == 0 {
353                bail!("DTT absent at level {} for gfn 0x{:x}", level, gfn);
354            }
355
356            pt_gpa = (parent_pt >> DTT_ENTRY_PFN_SHIFT) << PAGE_SHIFT_4K;
357            level -= 1;
358        }
359
360        let index = level_to_offset(gfn, level)? * mem::size_of::<u32>() as u64;
361
362        mem.get_host_address(GuestAddress(pt_gpa + index))
363            .context(Error::GetDTTEntry)?
364    } else if gfn > dtt_iter.gfn {
365        // SAFETY:
366        // Safe because we checked that dtt_iter.ptr is valid and that the dtt_pte
367        // for gfn lies on the same dtt page as the dtt_pte for dtt_iter.gfn, which
368        // means the calculated ptr will point to the same page as dtt_iter.ptr
369        unsafe {
370            dtt_iter
371                .ptr
372                .add(mem::size_of::<AtomicU32>() * (gfn - dtt_iter.gfn) as usize)
373        }
374    } else {
375        // SAFETY:
376        // Safe because we checked that dtt_iter.ptr is valid and that the dtt_pte
377        // for gfn lies on the same dtt page as the dtt_pte for dtt_iter.gfn, which
378        // means the calculated ptr will point to the same page as dtt_iter.ptr
379        unsafe {
380            dtt_iter
381                .ptr
382                .sub(mem::size_of::<AtomicU32>() * (dtt_iter.gfn - gfn) as usize)
383        }
384    };
385
386    dtt_iter.ptr = ptr;
387    dtt_iter.gfn = gfn;
388
389    Ok(ptr as *const AtomicU32)
390}
391
392fn pin_page(
393    pinstate: &mut CoIommuPinState,
394    policy: CoIommuUnpinPolicy,
395    vfio_container: &Arc<Mutex<VfioContainer>>,
396    mem: &GuestMemory,
397    dtt_level: u64,
398    dtt_root: u64,
399    dtt_iter: &mut DTTIter,
400    gfn: u64,
401) -> Result<()> {
402    let leaf_entry = gfn_to_dtt_pte(mem, dtt_level, dtt_root, dtt_iter, gfn)?;
403
404    let gpa = gfn << PAGE_SHIFT_4K;
405    let host_addr = mem
406        .get_host_address_range(GuestAddress(gpa), PAGE_SIZE_4K as usize)
407        .context("failed to get host address")? as u64;
408
409    // SAFETY:
410    // Safe because ptr is valid and guaranteed by the gfn_to_dtt_pte.
411    // Test PINNED flag
412    if (unsafe { (*leaf_entry).load(Ordering::Relaxed) } & DTTE_PINNED_FLAG) != 0 {
413        info!("CoIommu: gfn 0x{:x} already pinned", gfn);
414        return Ok(());
415    }
416
417    // SAFETY:
418    // Safe because the gpa is valid from the gfn_to_dtt_pte and the host_addr
419    // is guaranteed by MemoryMapping interface.
420    if unsafe { vfio_map(vfio_container, gpa, PAGE_SIZE_4K, host_addr) } {
421        // SAFETY:
422        // Safe because ptr is valid and guaranteed by the gfn_to_dtt_pte.
423        // set PINNED flag
424        unsafe { (*leaf_entry).fetch_or(DTTE_PINNED_FLAG, Ordering::SeqCst) };
425        if policy == CoIommuUnpinPolicy::Lru {
426            pinstate
427                .new_gen_pinned_pages
428                .push_back(PinnedPageInfo::new(gfn, 0));
429        }
430    }
431
432    Ok(())
433}
434
435#[derive(PartialEq, Debug, Eq)]
436enum UnpinResult {
437    UnpinlistEmpty,
438    Unpinned,
439    NotPinned,
440    NotUnpinned,
441    FailedUnpin,
442    UnpinParked,
443}
444
445fn unpin_page(
446    pinstate: &mut CoIommuPinState,
447    vfio_container: &Arc<Mutex<VfioContainer>>,
448    mem: &GuestMemory,
449    dtt_level: u64,
450    dtt_root: u64,
451    dtt_iter: &mut DTTIter,
452    gfn: u64,
453    force: bool,
454) -> UnpinResult {
455    if pinstate.unpin_thread_state == UnpinThreadState::Parked {
456        return UnpinResult::UnpinParked;
457    }
458
459    let leaf_entry = match gfn_to_dtt_pte(mem, dtt_level, dtt_root, dtt_iter, gfn) {
460        Ok(v) => v,
461        Err(_) => {
            // When force == true, the caller may try to unpin a page that is not
            // mapped in the dtt. For such a page, the pte doesn't exist yet, so
            // there is no need to report an error.
            // When force == false, coiommu is periodically unpinning pages that
            // have been mapped in the dtt, so the pte for such a page does exist.
            // However, with an unpin request from virtio-balloon, such pages may
            // already be unpinned and the DTT pages might have been reclaimed by
            // the guest OS kernel as well, so it is also possible to get here.
            // Don't report an error in that case either.
471            return UnpinResult::NotPinned;
472        }
473    };
474
475    if force {
476        // SAFETY:
477        // Safe because leaf_entry is valid and guaranteed by the gfn_to_dtt_pte.
        // This case is for the balloon to evict pages, so these pages should
        // already be locked by the balloon and no device driver in the VM is
        // able to access them. Just clear the ACCESSED flag first to make sure
        // the following unpin can succeed.
482        unsafe { (*leaf_entry).fetch_and(!DTTE_ACCESSED_FLAG, Ordering::SeqCst) };
483    }
484
485    // SAFETY:
486    // Safe because leaf_entry is valid and guaranteed by the gfn_to_dtt_pte.
487    if let Err(entry) = unsafe {
488        (*leaf_entry).compare_exchange(DTTE_PINNED_FLAG, 0, Ordering::SeqCst, Ordering::SeqCst)
489    } {
        // The compare_exchange failed because the original leaf entry is
        // not DTTE_PINNED_FLAG, so the unpin cannot be done.
492        if entry == 0 {
493            // The GFN is already unpinned. This is very similar to the
494            // gfn_to_dtt_pte error case, with the only difference being
495            // that the dtt_pte happens to be on a present page table.
496            UnpinResult::NotPinned
497        } else {
498            if !force {
499                // SAFETY:
500                // Safe because leaf_entry is valid and guaranteed by the gfn_to_dtt_pte.
                // The ACCESSED_FLAG is set by the guest if the guest requires a DMA
                // mapping for this page. It represents whether or not this page has
                // been touched by the guest. By clearing this flag after an unpin
                // attempt, we can detect at the next round of unpin work whether
                // the guest has touched the page since. If the ACCESSED_FLAG is set
                // again at the next round, unpinning this page will fail and we will
                // be here again to clear the flag. If the flag is not set at the
                // next round, unpinning this page will probably succeed.
509                unsafe { (*leaf_entry).fetch_and(!DTTE_ACCESSED_FLAG, Ordering::SeqCst) };
510            } else {
                // If we're here, then the guest is trying to release a page via the
                // balloon that it still has pinned. This most likely means that
                // something is wrong in the guest kernel. Just leave the page pinned
                // and log an error.
                // This failure blocks the balloon from removing the page, which
                // ensures that the guest's view of memory will remain consistent with
                // device DMA's view of memory. Also note that the host kernel
                // maintains an elevated refcount for pinned pages, which is a second
                // guarantee that pages accessible by device DMA won't be freed until
                // after they are unpinned.
                error!(
                    "CoIommu: force case cannot unpin gfn 0x{:x} entry 0x{:x}",
                    gfn, entry
                );
525            }
            // The GFN cannot be unpinned either because its map count
            // is non-zero or because it has the accessed flag set.
528            UnpinResult::NotUnpinned
529        }
530    } else {
        // The compare_exchange succeeded: the original leaf entry was
        // DTTE_PINNED_FLAG and the new leaf entry is now 0. Unpin the
        // page.
534        let gpa = gfn << PAGE_SHIFT_4K;
535        if vfio_unmap(vfio_container, gpa, PAGE_SIZE_4K) {
536            UnpinResult::Unpinned
537        } else {
538            // SAFETY:
539            // Safe because leaf_entry is valid and guaranteed by the gfn_to_dtt_pte.
540            // make sure the pinned flag is set
541            unsafe { (*leaf_entry).fetch_or(DTTE_PINNED_FLAG, Ordering::SeqCst) };
542            // need to put this gfn back to pinned vector
543            UnpinResult::FailedUnpin
544        }
545    }
546}
547
548struct PinWorker {
549    mem: GuestMemory,
550    endpoints: Vec<u16>,
551    notifymap_mmap: Arc<MemoryMapping>,
552    dtt_level: u64,
553    dtt_root: u64,
554    ioevents: Vec<Event>,
555    vfio_container: Arc<Mutex<VfioContainer>>,
556    pinstate: Arc<Mutex<CoIommuPinState>>,
557    params: CoIommuParameters,
558}
559
560impl PinWorker {
561    fn debug_label(&self) -> &'static str {
562        "CoIommuPinWorker"
563    }
564
565    fn run(&mut self, kill_evt: Event) {
566        #[derive(EventToken)]
567        enum Token {
568            Kill,
569            Pin { index: usize },
570        }
571
572        let wait_ctx: WaitContext<Token> =
573            match WaitContext::build_with(&[(&kill_evt, Token::Kill)]) {
574                Ok(pc) => pc,
575                Err(e) => {
576                    error!("{}: failed creating WaitContext: {}", self.debug_label(), e);
577                    return;
578                }
579            };
580
581        for (index, event) in self.ioevents.iter().enumerate() {
582            match wait_ctx.add(event, Token::Pin { index }) {
583                Ok(_) => {}
584                Err(e) => {
585                    error!(
586                        "{}: failed to add ioevent for index {}: {}",
587                        self.debug_label(),
588                        index,
589                        e
590                    );
591                    return;
592                }
593            }
594        }
595
596        'wait: loop {
597            let events = match wait_ctx.wait() {
598                Ok(v) => v,
599                Err(e) => {
600                    error!("{}: failed polling for events: {}", self.debug_label(), e);
601                    break;
602                }
603            };
604
605            for event in events.iter().filter(|e| e.is_readable) {
606                match event.token {
607                    Token::Kill => break 'wait,
608                    Token::Pin { index } => {
609                        let offset = index * mem::size_of::<u64>();
610                        if let Some(event) = self.ioevents.get(index) {
611                            if let Err(e) = event.wait() {
612                                error!(
613                                    "{}: failed reading event {}: {}",
614                                    self.debug_label(),
615                                    index,
616                                    e
617                                );
618                                self.notifymap_mmap.write_obj::<u64>(0, offset).unwrap();
619                                break 'wait;
620                            }
621                        }
622                        if let Ok(data) = self.notifymap_mmap.read_obj::<u64>(offset) {
623                            if let Err(e) = self.pin_pages(data) {
624                                error!("{}: {}", self.debug_label(), e);
625                            }
626                        }
627                        fence(Ordering::SeqCst);
628                        self.notifymap_mmap.write_obj::<u64>(0, offset).unwrap();
629                    }
630                }
631            }
632        }
633    }
634
635    fn pin_pages_in_batch(&mut self, gpa: u64) -> Result<()> {
636        let pin_page_info = self
637            .mem
638            .read_obj_from_addr::<PinPageInfo>(GuestAddress(gpa))
639            .context("failed to get pin page info")?;
640
641        let bdf = pin_page_info.bdf;
642        ensure!(
643            self.endpoints.contains(&bdf),
644            "pin page for unexpected bdf 0x{:x}",
645            bdf
646        );
647
648        let mut nr_pages = pin_page_info.nr_pages;
649        let mut offset = mem::size_of::<PinPageInfo>() as u64;
650        let mut dtt_iter: DTTIter = Default::default();
651        let mut pinstate = self.pinstate.lock();
652        while nr_pages > 0 {
653            let gfn = self
654                .mem
655                .read_obj_from_addr::<u64>(GuestAddress(gpa + offset))
656                .context("failed to get pin page gfn")?;
657
658            pin_page(
659                &mut pinstate,
660                self.params.unpin_policy,
661                &self.vfio_container,
662                &self.mem,
663                self.dtt_level,
664                self.dtt_root,
665                &mut dtt_iter,
666                gfn,
667            )?;
668
669            offset += mem::size_of::<u64>() as u64;
670            nr_pages -= 1;
671        }
672
673        Ok(())
674    }
675
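    // `gfn_bdf` encodes the pin request written to the notify register: if bit 63
    // (PIN_PAGES_IN_BATCH) is set, the remaining bits are the GPA of a PinPageInfo
    // batch; otherwise the low 16 bits are the endpoint BDF and the upper bits are
    // the gfn to pin.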
676    fn pin_pages(&mut self, gfn_bdf: u64) -> Result<()> {
677        if gfn_bdf & PIN_PAGES_IN_BATCH != 0 {
678            let gpa = gfn_bdf & !PIN_PAGES_IN_BATCH;
679            self.pin_pages_in_batch(gpa)
680        } else {
681            let bdf = (gfn_bdf & 0xffff) as u16;
682            let gfn = gfn_bdf >> 16;
683            let mut dtt_iter: DTTIter = Default::default();
684            ensure!(
685                self.endpoints.contains(&bdf),
686                "pin page for unexpected bdf 0x{:x}",
687                bdf
688            );
689
690            let mut pinstate = self.pinstate.lock();
691            pin_page(
692                &mut pinstate,
693                self.params.unpin_policy,
694                &self.vfio_container,
695                &self.mem,
696                self.dtt_level,
697                self.dtt_root,
698                &mut dtt_iter,
699                gfn,
700            )
701        }
702    }
703}
704
705struct UnpinWorker {
706    mem: GuestMemory,
707    dtt_level: u64,
708    dtt_root: u64,
709    vfio_container: Arc<Mutex<VfioContainer>>,
710    unpin_tube: Option<Tube>,
711    pinstate: Arc<Mutex<CoIommuPinState>>,
712    params: CoIommuParameters,
713    unpin_gen_threshold: u64,
714}
715
716impl UnpinWorker {
717    fn debug_label(&self) -> &'static str {
718        "CoIommuUnpinWorker"
719    }
720
721    fn run(&mut self, kill_evt: Event) {
722        #[derive(EventToken)]
723        enum Token {
724            UnpinTimer,
725            UnpinReq,
726            Kill,
727        }
728
729        let wait_ctx: WaitContext<Token> =
730            match WaitContext::build_with(&[(&kill_evt, Token::Kill)]) {
731                Ok(pc) => pc,
732                Err(e) => {
733                    error!("{}: failed creating WaitContext: {}", self.debug_label(), e);
734                    return;
735                }
736            };
737
738        if let Some(tube) = &self.unpin_tube {
739            if let Err(e) = wait_ctx.add(tube, Token::UnpinReq) {
740                error!("{}: failed creating WaitContext: {}", self.debug_label(), e);
741                return;
742            }
743        }
744
745        let mut unpin_timer = if self.params.unpin_policy != CoIommuUnpinPolicy::Off
746            && !self.params.unpin_interval.is_zero()
747        {
748            let mut timer = match Timer::new() {
749                Ok(t) => t,
750                Err(e) => {
751                    error!(
752                        "{}: failed to create the unpin timer: {}",
753                        self.debug_label(),
754                        e
755                    );
756                    return;
757                }
758            };
759            if let Err(e) = timer.reset_repeating(self.params.unpin_interval) {
760                error!(
761                    "{}: failed to start the unpin timer: {}",
762                    self.debug_label(),
763                    e
764                );
765                return;
766            }
767            if let Err(e) = wait_ctx.add(&timer, Token::UnpinTimer) {
768                error!("{}: failed creating WaitContext: {}", self.debug_label(), e);
769                return;
770            }
771            Some(timer)
772        } else {
773            None
774        };
775
776        let unpin_tube = self.unpin_tube.take();
777        'wait: loop {
778            let events = match wait_ctx.wait() {
779                Ok(v) => v,
780                Err(e) => {
781                    error!("{}: failed polling for events: {}", self.debug_label(), e);
782                    break;
783                }
784            };
785
786            for event in events.iter().filter(|e| e.is_readable) {
787                match event.token {
788                    Token::UnpinTimer => {
789                        self.unpin_pages();
790                        if let Some(timer) = &mut unpin_timer {
791                            if let Err(e) = timer.mark_waited() {
792                                error!(
793                                    "{}: failed to clear unpin timer: {}",
794                                    self.debug_label(),
795                                    e
796                                );
797                                break 'wait;
798                            }
799                        }
800                    }
801                    Token::UnpinReq => {
802                        if let Some(tube) = &unpin_tube {
803                            match tube.recv::<UnpinRequest>() {
804                                Ok(req) => {
805                                    let mut unpin_done = true;
806                                    for range in req.ranges {
807                                        // Locking with respect to pin_pages isn't necessary
808                                        // for this case because the unpinned pages in the range
809                                        // should all be in the balloon and so nothing will attempt
810                                        // to pin them.
811                                        if !self.unpin_pages_in_range(range.0, range.1) {
812                                            unpin_done = false;
813                                            break;
814                                        }
815                                    }
816                                    let resp = if unpin_done {
817                                        UnpinResponse::Success
818                                    } else {
819                                        UnpinResponse::Failed
820                                    };
821                                    if let Err(e) = tube.send(&resp) {
822                                        error!(
823                                            "{}: failed to send unpin response {}",
824                                            self.debug_label(),
825                                            e
826                                        );
827                                    }
828                                }
829                                Err(e) => {
830                                    if let TubeError::Disconnected = e {
831                                        if let Err(e) = wait_ctx.delete(tube) {
832                                            error!(
833                                                "{}: failed to remove unpin_tube: {}",
834                                                self.debug_label(),
835                                                e
836                                            );
837                                        }
838                                    } else {
839                                        error!(
840                                            "{}: failed to recv Unpin Request: {}",
841                                            self.debug_label(),
842                                            e
843                                        );
844                                    }
845                                }
846                            }
847                        }
848                    }
849                    Token::Kill => break 'wait,
850                }
851            }
852        }
853        self.unpin_tube = unpin_tube;
854    }
855
856    fn unpin_pages(&mut self) {
857        if self.params.unpin_policy == CoIommuUnpinPolicy::Lru {
858            self.lru_unpin_pages();
859        }
860    }
861
862    fn lru_unpin_page(
863        &mut self,
864        dtt_iter: &mut DTTIter,
865        new_gen: bool,
866    ) -> (UnpinResult, Option<PinnedPageInfo>) {
867        let mut pinstate = self.pinstate.lock();
868        let pageinfo = if new_gen {
869            pinstate.new_gen_pinned_pages.pop_front()
870        } else {
871            pinstate
872                .old_gen_pinned_pages
873                .pop_front()
874                .map(|gfn| PinnedPageInfo::new(gfn, 0))
875        };
876
877        pageinfo.map_or((UnpinResult::UnpinlistEmpty, None), |pageinfo| {
878            (
879                unpin_page(
880                    &mut pinstate,
881                    &self.vfio_container,
882                    &self.mem,
883                    self.dtt_level,
884                    self.dtt_root,
885                    dtt_iter,
886                    pageinfo.gfn,
887                    false,
888                ),
889                Some(pageinfo),
890            )
891        })
892    }
893
894    fn lru_unpin_pages_in_loop(&mut self, unpin_limit: Option<u64>, new_gen: bool) -> u64 {
895        let mut not_unpinned_new_gen_pages = VecDeque::new();
896        let mut not_unpinned_old_gen_pages = VecDeque::new();
897        let mut unpinned_count = 0;
898        let has_limit = unpin_limit.is_some();
899        let limit_count = unpin_limit.unwrap_or(0);
900        let mut dtt_iter: DTTIter = Default::default();
901
        // If has_limit is true but limit_count is 0, no unpinning will be done.
903        while !has_limit || unpinned_count != limit_count {
904            let (result, pinned_page) = self.lru_unpin_page(&mut dtt_iter, new_gen);
905            match result {
906                UnpinResult::UnpinlistEmpty => break,
907                UnpinResult::Unpinned => unpinned_count += 1,
908                UnpinResult::NotPinned => {}
909                UnpinResult::NotUnpinned => {
910                    if let Some(mut page) = pinned_page {
911                        if self.params.unpin_gen_threshold != 0 {
912                            page.unpin_busy_cnt += 1;
                            // The page came from the new_gen queue but was not
                            // successfully unpinned, so check the unpin_gen
                            // threshold. If the threshold is reached, move the
                            // page to the old_gen queue. If the page did not
                            // come from new_gen, put it directly into the
                            // old_gen queue.
919                            if !new_gen || page.unpin_busy_cnt >= self.params.unpin_gen_threshold {
920                                not_unpinned_old_gen_pages.push_back(page.gfn);
921                            } else {
922                                not_unpinned_new_gen_pages.push_back(page);
923                            }
924                        }
925                    }
926                }
927                UnpinResult::FailedUnpin | UnpinResult::UnpinParked => {
                    // Although UnpinParked means we didn't actually try to unpin
                    // the gfn, it's not worth handling specifically since parking
                    // is expected to be relatively rare.
931                    if let Some(page) = pinned_page {
932                        if new_gen {
933                            not_unpinned_new_gen_pages.push_back(page);
934                        } else {
935                            not_unpinned_old_gen_pages.push_back(page.gfn);
936                        }
937                    }
938                    if result == UnpinResult::UnpinParked {
939                        thread::park();
940                    }
941                }
942            }
943        }
944
945        if !not_unpinned_new_gen_pages.is_empty() {
946            let mut pinstate = self.pinstate.lock();
947            pinstate
948                .new_gen_pinned_pages
949                .append(&mut not_unpinned_new_gen_pages);
950        }
951
952        if !not_unpinned_old_gen_pages.is_empty() {
953            let mut pinstate = self.pinstate.lock();
954            pinstate
955                .old_gen_pinned_pages
956                .append(&mut not_unpinned_old_gen_pages);
957        }
958
959        unpinned_count
960    }
961
962    fn lru_unpin_pages(&mut self) {
963        let mut unpin_count = 0;
964        if self.params.unpin_gen_threshold != 0 {
965            self.unpin_gen_threshold += 1;
966            if self.unpin_gen_threshold == self.params.unpin_gen_threshold {
967                self.unpin_gen_threshold = 0;
                // Try to unpin the inactive (old_gen) queue first if the threshold is reached
                unpin_count = self.lru_unpin_pages_in_loop(self.params.unpin_limit, false);
            }
        }
        // Unpin the new_gen queue with the unpin_limit updated after unpinning the old_gen queue
973        self.lru_unpin_pages_in_loop(
974            self.params
975                .unpin_limit
976                .map(|limit| limit.saturating_sub(unpin_count)),
977            true,
978        );
979    }
980
981    fn unpin_pages_in_range(&self, gfn: u64, count: u64) -> bool {
982        let mut dtt_iter: DTTIter = Default::default();
983        let mut index = 0;
984        while index != count {
985            let mut pinstate = self.pinstate.lock();
986            let result = unpin_page(
987                &mut pinstate,
988                &self.vfio_container,
989                &self.mem,
990                self.dtt_level,
991                self.dtt_root,
992                &mut dtt_iter,
993                gfn + index,
994                true,
995            );
996            drop(pinstate);
997
998            match result {
999                UnpinResult::Unpinned | UnpinResult::NotPinned => {}
1000                UnpinResult::UnpinParked => {
1001                    thread::park();
1002                    continue;
1003                }
1004                _ => {
1005                    error!("coiommu: force unpin failed by {:?}", result);
1006                    return false;
1007                }
1008            }
1009            index += 1;
1010        }
1011        true
1012    }
1013}
1014
1015pub struct CoIommuDev {
1016    config_regs: PciConfiguration,
1017    pci_address: Option<PciAddress>,
1018    mem: GuestMemory,
1019    coiommu_reg: CoIommuReg,
1020    endpoints: Vec<u16>,
1021    notifymap_mem: SafeDescriptor,
1022    notifymap_mmap: Arc<MemoryMapping>,
1023    notifymap_addr: Option<u64>,
1024    topologymap_mem: SafeDescriptor,
1025    topologymap_addr: Option<u64>,
1026    mmapped: bool,
1027    vm_memory_client: VmMemoryClient,
1028    pin_thread: Option<WorkerThread<PinWorker>>,
1029    unpin_thread: Option<WorkerThread<UnpinWorker>>,
1030    unpin_tube: Option<Tube>,
1031    ioevents: Vec<Event>,
1032    vfio_container: Arc<Mutex<VfioContainer>>,
1033    pinstate: Arc<Mutex<CoIommuPinState>>,
1034    params: CoIommuParameters,
1035}
1036
1037impl CoIommuDev {
1038    pub fn new(
1039        mem: GuestMemory,
1040        vfio_container: Arc<Mutex<VfioContainer>>,
1041        vm_memory_client: VmMemoryClient,
1042        unpin_tube: Option<Tube>,
1043        endpoints: Vec<u16>,
1044        vcpu_count: u64,
1045        params: CoIommuParameters,
1046    ) -> Result<Self> {
1047        let config_regs = PciConfiguration::new(
1048            PCI_VENDOR_ID_COIOMMU,
1049            PCI_DEVICE_ID_COIOMMU,
1050            PciClassCode::Other,
1051            &PciOtherSubclass::Other,
1052            None, // No Programming interface.
1053            PciHeaderType::Device,
1054            PCI_VENDOR_ID_COIOMMU,
1055            PCI_DEVICE_ID_COIOMMU,
1056            COIOMMU_REVISION_ID,
1057        );
1058
        // notifymap_mem is used as Bar2 for the guest to check whether a request has been
        // completed by coIOMMU.
1060        let notifymap_mem = SharedMemory::new("coiommu_notifymap", COIOMMU_NOTIFYMAP_SIZE as u64)
1061            .context(Error::CreateSharedMemory)?;
1062        let notifymap_mmap = Arc::new(
1063            MemoryMappingBuilder::new(COIOMMU_NOTIFYMAP_SIZE)
1064                .from_shared_memory(&notifymap_mem)
1065                .offset(0)
1066                .build()?,
1067        );
1068
        // topologymap_mem is used as Bar4 for the guest to check which devices sit on top of
        // coIOMMU.
1070        let topologymap_mem =
1071            SharedMemory::new("coiommu_topologymap", COIOMMU_TOPOLOGYMAP_SIZE as u64)
1072                .context(Error::CreateSharedMemory)?;
1073        let topologymap_mmap = Arc::new(
1074            MemoryMappingBuilder::new(COIOMMU_TOPOLOGYMAP_SIZE)
1075                .from_shared_memory(&topologymap_mem)
1076                .offset(0)
1077                .build()?,
1078        );
1079
1080        ensure!(
1081            (endpoints.len() + 1) * mem::size_of::<u16>() <= COIOMMU_TOPOLOGYMAP_SIZE,
1082            "Coiommu: too many endpoints"
1083        );
1084        topologymap_mmap.write_obj::<u16>(endpoints.len() as u16, 0)?;
1085        for (index, endpoint) in endpoints.iter().enumerate() {
1086            topologymap_mmap.write_obj::<u16>(*endpoint, (index + 1) * mem::size_of::<u16>())?;
1087        }
1088
1089        let mut ioevents = Vec::new();
1090        for _ in 0..vcpu_count {
1091            ioevents.push(Event::new().context("CoIommu failed to create event fd")?);
1092        }
1093
1094        Ok(Self {
1095            config_regs,
1096            pci_address: None,
1097            mem,
1098            coiommu_reg: Default::default(),
1099            endpoints,
1100            notifymap_mem: notifymap_mem.into(),
1101            notifymap_mmap,
1102            notifymap_addr: None,
1103            topologymap_mem: topologymap_mem.into(),
1104            topologymap_addr: None,
1105            mmapped: false,
1106            vm_memory_client,
1107            pin_thread: None,
1108            unpin_thread: None,
1109            unpin_tube,
1110            ioevents,
1111            vfio_container,
1112            pinstate: Arc::new(Mutex::new(CoIommuPinState {
1113                new_gen_pinned_pages: VecDeque::new(),
1114                old_gen_pinned_pages: VecDeque::new(),
1115                unpin_thread_state: UnpinThreadState::Unparked,
1116                unpin_park_count: 0,
1117            })),
1118            params,
1119        })
1120    }
1121
1122    fn register_mmap(
1123        &self,
1124        descriptor: SafeDescriptor,
1125        size: usize,
1126        offset: u64,
1127        gpa: u64,
1128        prot: Protection,
1129    ) -> Result<()> {
1130        let _region = self
1131            .vm_memory_client
1132            .register_memory(
1133                VmMemorySource::Descriptor {
1134                    descriptor,
1135                    offset,
1136                    size: size as u64,
1137                },
1138                VmMemoryDestination::GuestPhysicalAddress(gpa),
1139                prot,
1140                MemCacheType::CacheCoherent,
1141            )
1142            .context("register_mmap register_memory failed")?;
1143        Ok(())
1144    }
1145
1146    fn mmap(&mut self) {
1147        if self.mmapped {
1148            return;
1149        }
1150
1151        if let Some(gpa) = self.notifymap_addr {
1152            match self.register_mmap(
1153                self.notifymap_mem.try_clone().unwrap(),
1154                COIOMMU_NOTIFYMAP_SIZE,
1155                0,
1156                gpa,
1157                Protection::read_write(),
1158            ) {
1159                Ok(_) => {}
1160                Err(e) => {
1161                    panic!("{}: map notifymap failed: {}", self.debug_label(), e);
1162                }
1163            }
1164        }
1165
1166        if let Some(gpa) = self.topologymap_addr {
1167            match self.register_mmap(
1168                self.topologymap_mem.try_clone().unwrap(),
1169                COIOMMU_TOPOLOGYMAP_SIZE,
1170                0,
1171                gpa,
1172                Protection::read(),
1173            ) {
1174                Ok(_) => {}
1175                Err(e) => {
1176                    panic!("{}: map topologymap failed: {}", self.debug_label(), e);
1177                }
1178            }
1179        }
1180
1181        self.mmapped = true;
1182    }
1183
1184    fn start_workers(&mut self) {
1185        if self.pin_thread.is_none() {
1186            self.start_pin_thread();
1187        }
1188
1189        if self.unpin_thread.is_none() {
1190            self.start_unpin_thread();
1191        }
1192    }
1193
1194    fn start_pin_thread(&mut self) {
1195        let mem = self.mem.clone();
1196        let endpoints = self.endpoints.to_vec();
1197        let notifymap_mmap = self.notifymap_mmap.clone();
1198        let dtt_root = self.coiommu_reg.dtt_root;
1199        let dtt_level = self.coiommu_reg.dtt_level;
1200        let ioevents: Vec<Event> = self
1201            .ioevents
1202            .iter()
1203            .map(|e| e.try_clone().unwrap())
1204            .collect();
1205
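        // Each vcpu has its own byte-wide notify (trigger) register in BAR0,
        // immediately after the CoIommuReg block. Register an ioevent for each so
        // that guest writes wake the pin worker.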
1206        let bar0 = self.config_regs.get_bar_addr(COIOMMU_MMIO_BAR);
1207        let notify_base = bar0 + mem::size_of::<CoIommuReg>() as u64;
1208        for (i, evt) in self.ioevents.iter().enumerate() {
1209            self.vm_memory_client
1210                .register_io_event(
1211                    evt.try_clone().expect("failed to clone event"),
1212                    notify_base + i as u64,
1213                    Datamatch::AnyLength,
1214                )
1215                .expect("failed to register ioevent");
1216        }
1217
1218        let vfio_container = self.vfio_container.clone();
1219        let pinstate = self.pinstate.clone();
1220        let params = self.params;
1221
1222        self.pin_thread = Some(WorkerThread::start("coiommu_pin", move |kill_evt| {
1223            let mut worker = PinWorker {
1224                mem,
1225                endpoints,
1226                notifymap_mmap,
1227                dtt_root,
1228                dtt_level,
1229                ioevents,
1230                vfio_container,
1231                pinstate,
1232                params,
1233            };
1234            worker.run(kill_evt);
1235            worker
1236        }));
1237    }
1238
1239    fn start_unpin_thread(&mut self) {
1240        let mem = self.mem.clone();
1241        let dtt_root = self.coiommu_reg.dtt_root;
1242        let dtt_level = self.coiommu_reg.dtt_level;
1243        let vfio_container = self.vfio_container.clone();
1244        let unpin_tube = self.unpin_tube.take();
1245        let pinstate = self.pinstate.clone();
1246        let params = self.params;
1247        self.unpin_thread = Some(WorkerThread::start("coiommu_unpin", move |kill_evt| {
1248            let mut worker = UnpinWorker {
1249                mem,
1250                dtt_level,
1251                dtt_root,
1252                vfio_container,
1253                unpin_tube,
1254                pinstate,
1255                params,
1256                unpin_gen_threshold: 0,
1257            };
1258            worker.run(kill_evt);
1259            worker
1260        }));
1261    }
1262
1263    fn allocate_bar_address(
1264        &mut self,
1265        resources: &mut SystemAllocator,
1266        address: PciAddress,
1267        size: u64,
1268        bar_num: u8,
1269        name: &str,
1270    ) -> PciResult<u64> {
1271        let addr = resources
1272            .allocate_mmio(
1273                size,
1274                Alloc::PciBar {
1275                    bus: address.bus,
1276                    dev: address.dev,
1277                    func: address.func,
1278                    bar: bar_num,
1279                },
1280                name.to_string(),
1281                AllocOptions::new().prefetchable(true).align(size),
1282            )
1283            .map_err(|e| PciDeviceError::IoAllocationFailed(size, e))?;
1284
1285        let bar = PciBarConfiguration::new(
1286            bar_num as usize,
1287            size,
1288            PciBarRegionType::Memory64BitRegion,
1289            PciBarPrefetchable::Prefetchable,
1290        )
1291        .set_address(addr);
1292
1293        self.config_regs
1294            .add_pci_bar(bar)
1295            .map_err(|e| PciDeviceError::IoRegistrationFailed(addr, e))?;
1296
1297        Ok(addr)
1298    }
1299
1300    fn read_mmio(&mut self, offset: u64, data: &mut [u8]) {
1301        if offset >= mem::size_of::<CoIommuReg>() as u64 {
1302            error!(
1303                "{}: read_mmio: invalid offset 0x{:x}",
1304                self.debug_label(),
1305                offset
1306            );
1307            return;
1308        }
1309
        // Sanity check: the access must be 64-bit aligned.
1311        if offset % 8 != 0 || data.len() != 8 {
1312            error!(
1313                "{}: read_mmio: unaligned accessing: offset 0x{:x} actual len {} expect len 8",
1314                self.debug_label(),
1315                offset,
1316                data.len()
1317            );
1318            return;
1319        }
1320
1321        let v = match offset / 8 {
1322            0 => self.coiommu_reg.dtt_root,
1323            1 => self.coiommu_reg.cmd,
1324            2 => self.coiommu_reg.dtt_level,
1325            _ => return,
1326        };
1327
1328        data.copy_from_slice(&v.to_ne_bytes());
1329    }
1330
1331    fn write_mmio(&mut self, offset: u64, data: &[u8]) {
1332        let mmio_len = mem::size_of::<CoIommuReg>() as u64;
1333        if offset >= mmio_len {
1334            if data.len() != 1 {
1335                error!(
1336                    "{}: write_mmio: unaligned accessing: offset 0x{:x} actual len {} expect len 1",
1337                    self.debug_label(),
1338                    offset,
1339                    data.len()
1340                );
1341                return;
1342            }
1343
            // Usually we will not get here, as this range holds the per-vcpu notify
            // registers, which are monitored by the ioevents. Notify registers that
            // are not covered by the ioevents are not used by the frontend driver.
            // In case the frontend driver does get here, handle it simply so that
            // the frontend driver will not be blocked, and log an error.
1350            let index = (offset - mmio_len) as usize;
1351            if let Some(event) = self.ioevents.get(index) {
1352                let _ = event.signal();
1353            } else {
1354                self.notifymap_mmap
1355                    .write_obj::<u64>(0, index * mem::size_of::<u64>())
1356                    .unwrap();
1357                error!(
1358                    "{}: No page will be pinned as driver is accessing unused trigger register: offset 0x{:x}",
1359                    self.debug_label(),
1360                    offset
1361                );
1362            }
1363            return;
1364        }
1365
        // Sanity check: CoIommuReg accesses must be 64-bit aligned.
1367        if offset % 8 != 0 || data.len() != 8 {
1368            error!(
1369                "{}: write_mmio: unaligned accessing: offset 0x{:x} actual len {} expect len 8",
1370                self.debug_label(),
1371                offset,
1372                data.len()
1373            );
1374            return;
1375        }
1376
1377        let index = offset / 8;
1378        let v = u64::from_ne_bytes(data.try_into().unwrap());
1379        match index {
1380            0 => {
1381                if self.coiommu_reg.dtt_root == 0 {
1382                    self.coiommu_reg.dtt_root = v;
1383                }
1384            }
1385            1 => match v {
                // Deactivate can happen if the frontend driver in the guest
                // fails during probing or if the CoIommu device is removed
                // by the guest. Neither of these cases is expected, and if
                // either happens the guest will be non-functional because the
                // pass-through devices that rely on CoIommu will not work.
                // So just fail hard and panic.
1392                COIOMMU_CMD_DEACTIVATE => {
1393                    panic!("{}: Deactivate is not supported", self.debug_label())
1394                }
1395                COIOMMU_CMD_ACTIVATE => {
1396                    if self.coiommu_reg.dtt_root != 0 && self.coiommu_reg.dtt_level != 0 {
1397                        self.start_workers();
1398                    }
1399                }
1400                COIOMMU_CMD_PARK_UNPIN => {
1401                    let mut pinstate = self.pinstate.lock();
1402                    pinstate.unpin_thread_state = UnpinThreadState::Parked;
1403                    if let Some(v) = pinstate.unpin_park_count.checked_add(1) {
1404                        pinstate.unpin_park_count = v;
1405                    } else {
1406                        panic!("{}: Park request overflowing", self.debug_label());
1407                    }
1408                }
1409                COIOMMU_CMD_UNPARK_UNPIN => {
1410                    let mut pinstate = self.pinstate.lock();
1411                    if pinstate.unpin_thread_state == UnpinThreadState::Parked {
1412                        if let Some(v) = pinstate.unpin_park_count.checked_sub(1) {
1413                            pinstate.unpin_park_count = v;
1414                            if pinstate.unpin_park_count == 0 {
1415                                if let Some(worker_thread) = &self.unpin_thread {
1416                                    worker_thread.thread().unpark();
1417                                }
1418                                pinstate.unpin_thread_state = UnpinThreadState::Unparked;
1419                            }
1420                        } else {
                            error!("{}: Park count has already reached 0", self.debug_label());
1422                        }
1423                    }
1424                }
1425                _ => {}
1426            },
1427            2 => {
1428                if self.coiommu_reg.dtt_level == 0 {
1429                    self.coiommu_reg.dtt_level = v;
1430                }
1431            }
1432            _ => {}
1433        }
1434    }
1435}
1436
1437impl PciDevice for CoIommuDev {
1438    fn debug_label(&self) -> String {
1439        "CoIommu".to_owned()
1440    }
1441
1442    fn allocate_address(&mut self, resources: &mut SystemAllocator) -> PciResult<PciAddress> {
1443        if self.pci_address.is_none() {
1444            self.pci_address = resources.allocate_pci(0, self.debug_label());
1445        }
1446        self.pci_address.ok_or(PciDeviceError::PciAllocationFailed)
1447    }
1448
1449    fn allocate_io_bars(&mut self, resources: &mut SystemAllocator) -> PciResult<Vec<BarRange>> {
1450        let address = self
1451            .pci_address
1452            .expect("allocate_address must be called prior to allocate_io_bars");
1453
        // Allocate the MMIO bar (BAR0) holding the CoIommuReg block and the per-vcpu
        // notify registers.
1455        let mut ranges: Vec<BarRange> = Vec::new();
1456
1457        let mmio_addr = self.allocate_bar_address(
1458            resources,
1459            address,
1460            COIOMMU_MMIO_BAR_SIZE,
1461            COIOMMU_MMIO_BAR as u8,
1462            "coiommu-mmiobar",
1463        )?;
1464
1465        ranges.push(BarRange {
1466            addr: mmio_addr,
1467            size: COIOMMU_MMIO_BAR_SIZE,
1468            prefetchable: false,
1469        });
1470
1471        Ok(ranges)
1472    }
1473
1474    fn allocate_device_bars(
1475        &mut self,
1476        resources: &mut SystemAllocator,
1477    ) -> PciResult<Vec<BarRange>> {
1478        let address = self
1479            .pci_address
1480            .expect("allocate_address must be called prior to allocate_device_bars");
1481
1482        let mut ranges: Vec<BarRange> = Vec::new();
1483
1484        let topologymap_addr = self.allocate_bar_address(
1485            resources,
1486            address,
1487            COIOMMU_TOPOLOGYMAP_SIZE as u64,
1488            COIOMMU_TOPOLOGYMAP_BAR,
1489            "coiommu-topology",
1490        )?;
1491        self.topologymap_addr = Some(topologymap_addr);
1492        ranges.push(BarRange {
1493            addr: topologymap_addr,
1494            size: COIOMMU_TOPOLOGYMAP_SIZE as u64,
1495            prefetchable: false,
1496        });
1497
1498        let notifymap_addr = self.allocate_bar_address(
1499            resources,
1500            address,
1501            COIOMMU_NOTIFYMAP_SIZE as u64,
1502            COIOMMU_NOTIFYMAP_BAR as u8,
1503            "coiommu-notifymap",
1504        )?;
1505        self.notifymap_addr = Some(notifymap_addr);
1506        ranges.push(BarRange {
1507            addr: notifymap_addr,
1508            size: COIOMMU_NOTIFYMAP_SIZE as u64,
1509            prefetchable: false,
1510        });
1511
1512        Ok(ranges)
1513    }
1514
1515    fn read_config_register(&self, reg_idx: usize) -> u32 {
1516        self.config_regs.read_reg(reg_idx)
1517    }
1518
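    // The notifymap/topologymap shared memory is mapped into the guest the first
    // time the guest enables memory space decoding in the COMMAND register, since
    // the BAR addresses are only meaningful from that point on.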
1519    fn write_config_register(&mut self, reg_idx: usize, offset: u64, data: &[u8]) {
1520        if reg_idx == COMMAND_REG
1521            && data.len() == 2
1522            && data[0] & COMMAND_REG_MEMORY_SPACE_MASK as u8 != 0
1523            && !self.mmapped
1524        {
1525            self.mmap();
1526        }
1527
1528        self.config_regs.write_reg(reg_idx, offset, data);
1529    }
1530
1531    fn keep_rds(&self) -> Vec<RawDescriptor> {
1532        let mut rds = vec![
1533            self.vfio_container.lock().as_raw_descriptor(),
1534            self.vm_memory_client.as_raw_descriptor(),
1535            self.notifymap_mem.as_raw_descriptor(),
1536            self.topologymap_mem.as_raw_descriptor(),
1537        ];
1538        if let Some(unpin_tube) = &self.unpin_tube {
1539            rds.push(unpin_tube.as_raw_descriptor());
1540        }
1541        rds.extend(self.ioevents.iter().map(Event::as_raw_descriptor));
1542        rds
1543    }
1544
1545    fn read_bar(&mut self, bar_index: PciBarIndex, offset: u64, data: &mut [u8]) {
1546        match bar_index {
1547            COIOMMU_MMIO_BAR => self.read_mmio(offset, data),
1548            COIOMMU_NOTIFYMAP_BAR => {
                // With the coiommu device activated, accessing the notifymap bar
                // doesn't cause a vmexit. If we get here, the coiommu device is
                // deactivated and no pin/unpin work will be done, so there is no
                // need to handle this notifymap read.
1553            }
1554            _ => {}
1555        }
1556    }
1557
1558    fn write_bar(&mut self, bar_index: PciBarIndex, offset: u64, data: &[u8]) {
1559        match bar_index {
1560            COIOMMU_MMIO_BAR => self.write_mmio(offset, data),
1561            COIOMMU_NOTIFYMAP_BAR => {
                // With the coiommu device activated, accessing the notifymap bar
                // doesn't cause a vmexit. If we get here, the coiommu device is
                // deactivated and no pin/unpin work will be done, so there is no
                // need to handle this notifymap write.
1566            }
1567            _ => {}
1568        }
1569    }
1570
1571    fn get_bar_configuration(&self, bar_num: usize) -> Option<PciBarConfiguration> {
1572        self.config_regs.get_bar_configuration(bar_num)
1573    }
1574}
1575
1576impl Suspendable for CoIommuDev {}
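
// Minimal sanity checks (illustrative sketch) for the DTT index math above;
// assumes the crate's standard `cargo test` setup.
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn level_to_offset_indexes_each_level() {
        let gfn = 0x12345u64;
        // The leaf level uses the low 10 bits of the gfn.
        assert_eq!(level_to_offset(gfn, 1).unwrap(), gfn & COIOMMU_PT_LEVEL_MASK);
        // Level 2 uses the 9 bits above the leaf stride.
        assert_eq!(
            level_to_offset(gfn, 2).unwrap(),
            (gfn >> 10) & COIOMMU_UPPER_LEVEL_MASK
        );
        // Level 3 uses the following 9 bits.
        assert_eq!(
            level_to_offset(gfn, 3).unwrap(),
            (gfn >> 19) & COIOMMU_UPPER_LEVEL_MASK
        );
        // Level 0 is invalid.
        assert!(level_to_offset(gfn, 0).is_err());
    }
}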