devices/irqchip/userspace.rs

// Copyright 2020 The ChromiumOS Authors
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

use std::convert::TryFrom;
use std::convert::TryInto;
use std::fmt;
use std::fmt::Display;
use std::iter;
use std::sync::Arc;

cfg_if::cfg_if! {
    if #[cfg(test)] {
        use base::{FakeClock as Clock, FakeTimer as Timer};
    } else {
        use base::{Clock, Timer};
    }
}
use base::error;
use base::info;
use base::warn;
use base::AsRawDescriptor;
use base::Descriptor;
use base::Error;
use base::Event;
use base::EventToken;
use base::Result;
use base::Tube;
use base::WaitContext;
use base::WorkerThread;
use hypervisor::DeliveryMode;
use hypervisor::IoapicState;
use hypervisor::IrqRoute;
use hypervisor::IrqSource;
use hypervisor::IrqSourceChip;
use hypervisor::LapicState;
use hypervisor::MPState;
use hypervisor::MsiAddressMessage;
use hypervisor::MsiDataMessage;
use hypervisor::PicSelect;
use hypervisor::PicState;
use hypervisor::PitState;
use hypervisor::Vcpu;
use hypervisor::VcpuX86_64;
use resources::SystemAllocator;
use snapshot::AnySnapshot;
use sync::Condvar;
use sync::Mutex;

use crate::bus::BusDeviceSync;
use crate::irqchip::Apic;
use crate::irqchip::ApicBusMsg;
use crate::irqchip::DelayedIoApicIrqEvents;
use crate::irqchip::Interrupt;
use crate::irqchip::InterruptData;
use crate::irqchip::InterruptDestination;
use crate::irqchip::Ioapic;
use crate::irqchip::IrqEvent;
use crate::irqchip::IrqEventIndex;
use crate::irqchip::Pic;
use crate::irqchip::Routes;
use crate::irqchip::VcpuRunState;
use crate::irqchip::APIC_BASE_ADDRESS;
use crate::irqchip::APIC_MEM_LENGTH_BYTES;
use crate::irqchip::IOAPIC_BASE_ADDRESS;
use crate::irqchip::IOAPIC_MEM_LENGTH_BYTES;
use crate::pci::CrosvmDeviceId;
use crate::Bus;
use crate::BusAccessInfo;
use crate::BusDevice;
use crate::DeviceId;
use crate::IrqChip;
use crate::IrqChipCap;
use crate::IrqChipX86_64;
use crate::IrqEdgeEvent;
use crate::IrqEventSource;
use crate::IrqLevelEvent;
use crate::Pit;
use crate::PitError;
use crate::Suspendable;

/// PIT channel 0 timer is connected to IRQ 0
const PIT_CHANNEL0_IRQ: u32 = 0;
/// CR0 extension type bit
const X86_CR0_ET: u64 = 0x00000010;
/// CR0 not write through bit
const X86_CR0_NW: u64 = 0x20000000;
/// CR0 cache disable bit
const X86_CR0_CD: u64 = 0x40000000;
/// Default power on state of CR0 register, according to the Intel manual.
const X86_CR0_INIT: u64 = X86_CR0_ET | X86_CR0_NW | X86_CR0_CD;

/// An `IrqChip` with all interrupt devices emulated in userspace.  `UserspaceIrqChip` works with
/// any hypervisor, but only supports x86.
pub struct UserspaceIrqChip<V: VcpuX86_64> {
    pub vcpus: Arc<Mutex<Vec<Option<V>>>>,
    routes: Arc<Mutex<Routes>>,
    pit: Arc<Mutex<Pit>>,
    pic: Arc<Mutex<Pic>>,
    ioapic: Arc<Mutex<Ioapic>>,
    ioapic_pins: usize,
    pub apics: Vec<Arc<Mutex<Apic>>>,
    // Condition variables used by wait_until_runnable.
    waiters: Vec<Arc<Waiter>>,
    // Raw descriptors of the apic Timers.
    timer_descriptors: Vec<Descriptor>,
    /// Delayed ioapic irq object that contains the events delayed because the ioapic was locked
    /// when service_irq was called on the irqchip. This prevents deadlocks when a Vcpu thread has
    /// locked the ioapic and the ioapic sends an AddMsiRoute signal to the main thread (which
    /// itself may be busy trying to call service_irq).
    ///
    /// ## Note:
    /// This lock may be locked by itself to access the `DelayedIoApicIrqEvents`. If accessed in
    /// conjunction with the `irq_events` field, that lock should be taken first to prevent
    /// deadlocks stemming from lock-ordering issues.
    delayed_ioapic_irq_events: Arc<Mutex<DelayedIoApicIrqEvents>>,
    // Array of Events that devices will use to assert ioapic pins.
    irq_events: Arc<Mutex<Vec<Option<IrqEvent>>>>,
    dropper: Arc<Mutex<Dropper>>,
    activated: bool,
}

/// Helper that implements `Drop` on behalf of `UserspaceIrqChip`.  The many cloned copies of an irq
/// chip share a single arc'ed `Dropper`, which only runs its drop when the last irq chip copy is
/// dropped.
struct Dropper {
    /// Worker threads that deliver timer events to the APICs.
    workers: Vec<WorkerThread<()>>,
}

impl<V: VcpuX86_64 + 'static> UserspaceIrqChip<V> {
    /// Constructs a new `UserspaceIrqChip`.
    pub fn new(num_vcpus: usize, irq_tube: Tube, ioapic_pins: Option<usize>) -> Result<Self> {
        let clock = Arc::new(Mutex::new(Clock::new()));
        Self::new_with_clock(num_vcpus, irq_tube, ioapic_pins, clock)
    }

    /// Constructs a new `UserspaceIrqChip`, with a clock.  Used for testing.
    pub fn new_with_clock(
        num_vcpus: usize,
        irq_tube: Tube,
        ioapic_pins: Option<usize>,
        clock: Arc<Mutex<Clock>>,
    ) -> Result<Self> {
        let pit_evt = IrqEdgeEvent::new()?;
        // For test only, this clock instance is a FakeClock. It needs to be cloned for every Timer
        // instance, so make a clone of it now.
        #[cfg(test)]
        let test_clock = clock.clone();
        let pit = Pit::new(pit_evt.try_clone()?, clock).map_err(|e| match e {
            PitError::CloneEvent(err) => err,
            PitError::CreateEvent(err) => err,
            PitError::CreateWaitContext(err) => err,
            PitError::TimerCreateError(err) => err,
            PitError::WaitError(err) => err,
            PitError::SpawnThread(_) => Error::new(libc::EIO),
        })?;
        let pit_event_source = IrqEventSource::from_device(&pit);

        let ioapic_pins = ioapic_pins.unwrap_or(hypervisor::NUM_IOAPIC_PINS);
        let ioapic = Ioapic::new(irq_tube, ioapic_pins)?;

        let mut timer_descriptors: Vec<Descriptor> = Vec::with_capacity(num_vcpus);
        let mut apics: Vec<Arc<Mutex<Apic>>> = Vec::with_capacity(num_vcpus);
        for id in 0..num_vcpus {
            cfg_if::cfg_if! {
                if #[cfg(test)] {
                    let timer = Timer::new(test_clock.clone());
                } else {
                    let timer = Timer::new()?;
                }
            }
            // Timers are owned by the apics, which outlive the raw descriptors stored here and in
            // the worker threads.
            timer_descriptors.push(Descriptor(timer.as_raw_descriptor()));

            let id: u8 = id.try_into().or(Err(Error::new(libc::EINVAL)))?;
            let apic = Apic::new(id, Box::new(timer));
            apics.push(Arc::new(Mutex::new(apic)));
        }
        let dropper = Dropper {
            workers: Vec::new(),
        };

        let mut chip = UserspaceIrqChip {
            vcpus: Arc::new(Mutex::new(
                iter::repeat_with(|| None).take(num_vcpus).collect(),
            )),
            waiters: iter::repeat_with(Default::default)
                .take(num_vcpus)
                .collect(),
            routes: Arc::new(Mutex::new(Routes::new())),
            pit: Arc::new(Mutex::new(pit)),
            pic: Arc::new(Mutex::new(Pic::new())),
            ioapic: Arc::new(Mutex::new(ioapic)),
            ioapic_pins,
            apics,
            timer_descriptors,
            delayed_ioapic_irq_events: Arc::new(Mutex::new(DelayedIoApicIrqEvents::new()?)),
            irq_events: Arc::new(Mutex::new(Vec::new())),
            dropper: Arc::new(Mutex::new(dropper)),
            activated: false,
        };

        // Set up standard x86 irq routes.
        chip.set_irq_routes(&Routes::default_pic_ioapic_routes(ioapic_pins))?;

        chip.register_edge_irq_event(PIT_CHANNEL0_IRQ, &pit_evt, pit_event_source)?;
        Ok(chip)
    }

    /// Handles a message from an APIC.
    fn handle_msg(&self, msg: ApicBusMsg) {
        match msg {
            ApicBusMsg::Eoi(vector) => {
                let _ = self.broadcast_eoi(vector);
            }
            ApicBusMsg::Ipi(interrupt) => self.send_irq_to_apics(&interrupt),
        }
    }

    /// Sends a Message Signaled Interrupt to one or more APICs.  An MSI carries a 64-bit address
    /// and 32-bit data, but in the Intel spec we're implementing, only the low 32 bits of the
    /// address are used.
    fn send_msi(&self, addr: u32, data: u32) {
        let mut msi_addr = MsiAddressMessage::new();
        msi_addr.set(0, 32, addr as u64);
        let dest = match InterruptDestination::try_from(&msi_addr) {
            Ok(dest) => dest,
            Err(e) => {
                warn!("Invalid MSI message: {}", e);
                return;
            }
        };

        let mut msi_data = MsiDataMessage::new();
        msi_data.set(0, 32, data as u64);
        let data = InterruptData::from(&msi_data);

        self.send_irq_to_apics(&Interrupt { dest, data });
    }
    pub fn send_irq_to_apic(&self, id: usize, irq: &InterruptData) {
        // id can come from the guest, so check bounds.
        if let Some(apic) = self.apics.get(id) {
            apic.lock().accept_irq(irq);
        } else {
            error!("Interrupt for non-existent apic {}: {:?}", id, irq);
        }
        if let Some(Some(vcpu)) = self.vcpus.lock().get(id) {
            vcpu.set_interrupt_window_requested(true);
        } else {
            error!("Interrupt for non-existent vcpu {}: {:?}", id, irq);
        }
        self.waiters[id].notify();
    }

    /// Sends an interrupt to one or more APICs.  Used for sending MSIs and IPIs.
    pub fn send_irq_to_apics(&self, irq: &Interrupt) {
        match irq.data.delivery {
            DeliveryMode::Fixed | DeliveryMode::Lowest | DeliveryMode::RemoteRead => {}
            _ => info!("UserspaceIrqChip received special irq: {:?}", irq),
        }

        // First try the fast path, where the destination is a single APIC we can send to directly.
        if let Some(apic_id) = Apic::single_dest_fast(&irq.dest) {
            self.send_irq_to_apic(apic_id as usize, &irq.data);
            return;
        }

        let lowest_mode = irq.data.delivery == DeliveryMode::Lowest;
        let mut lowest_priority = u8::MAX;
        let mut lowest_apic: Option<usize> = None;

        for (i, apic) in self.apics.iter().enumerate() {
            let send = {
                let apic = apic.lock();
                if !apic.match_dest(&irq.dest) {
                    false
                } else if lowest_mode {
                    let priority = apic.get_processor_priority();
                    if priority <= lowest_priority {
                        lowest_priority = priority;
                        lowest_apic = Some(i);
                    }
                    false
                } else {
                    true
                }
            };
            if send {
                self.send_irq_to_apic(i, &irq.data);
            }
        }

        if lowest_mode {
            if let Some(index) = lowest_apic {
                self.send_irq_to_apic(index, &irq.data);
            } else {
                // According to sections 10.6.2.1 and 10.6.2.2 of the SDM, the OS should not let
                // this happen.  If the OS is misconfigured then drop the interrupt and log a
                // warning.
                warn!(
                    "Lowest priority interrupt sent, but no apics configured as valid target: {:?}",
                    irq
                );
            }
        }
    }

    /// Delivers a startup IPI to `vcpu`.
    fn deliver_startup(&self, vcpu: &V, vector: u8) -> Result<()> {
        // This comes from Intel SDM volume 3, chapter 8.4.  The vector specifies a page aligned
        // address where execution should start.  cs.base is the offset for the code segment with an
        // RIP of 0.  The cs.selector is just the base shifted right by 4 bits.
        let mut sregs = vcpu.get_sregs()?;
        sregs.cs.base = (vector as u64) << 12;
        sregs.cs.selector = (vector as u16) << 8;

        // Set CR0 to its INIT value per the manual.  Application processors won't boot with the CR0
        // protected mode and paging bits set by setup_sregs().  Kernel APIC doesn't have this
        // issue, probably because it uses MSRs instead of MMIO, so it's less affected when the AP's
        // state (CR3 etc.) doesn't reflect changes that Linux made while booting vcpu 0.
        sregs.cr0 = X86_CR0_INIT;
        vcpu.set_sregs(&sregs)?;

        let mut regs = vcpu.get_regs()?;
        regs.rip = 0;
        vcpu.set_regs(&regs)?;

        Ok(())
    }

    /// Checks if the specified VCPU is in a runnable state.
    fn is_runnable(&self, vcpu_id: usize) -> bool {
        self.apics[vcpu_id].lock().get_mp_state() == MPState::Runnable
    }
}

impl Dropper {
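    /// Stops the timer worker threads.  They are respawned by `UserspaceIrqChip::wake()` when the
    /// chip is resumed.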
    fn sleep(&mut self) -> anyhow::Result<()> {
        for thread in self.workers.split_off(0).into_iter() {
            thread.stop();
        }
        Ok(())
    }
}

impl<V: VcpuX86_64 + 'static> UserspaceIrqChip<V> {
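    /// Registers `irq_event` (and an optional `resample_event`) for GSI `irq`, returning the
    /// index at which the event was stored in `irq_events`.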
    fn register_irq_event(
        &mut self,
        irq: u32,
        irq_event: &Event,
        resample_event: Option<&Event>,
        source: IrqEventSource,
    ) -> Result<Option<IrqEventIndex>> {
        let mut evt = IrqEvent {
            gsi: irq,
            event: irq_event.try_clone()?,
            resample_event: None,
            source,
        };
        if let Some(resample_event) = resample_event {
            evt.resample_event = Some(resample_event.try_clone()?);
        }

        let mut irq_events = self.irq_events.lock();
        let index = irq_events.len();
        irq_events.push(Some(evt));
        Ok(Some(index))
    }

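    /// Removes the first entry in `irq_events` whose GSI and `Event` match `irq` and `irq_event`.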
    fn unregister_irq_event(&mut self, irq: u32, irq_event: &Event) -> Result<()> {
        let mut irq_events = self.irq_events.lock();
        // Find the first registration matching both the GSI and the event, then clear its slot.
        let index = irq_events.iter().position(|evt| match evt {
            Some(evt) => evt.gsi == irq && irq_event.eq(&evt.event),
            None => false,
        });
        if let Some(index) = index {
            irq_events[index] = None;
        }
        Ok(())
    }
}

impl<V: VcpuX86_64 + 'static> IrqChip for UserspaceIrqChip<V> {
    fn add_vcpu(&mut self, vcpu_id: usize, vcpu: &dyn Vcpu) -> Result<()> {
        let vcpu: &V = vcpu
            .downcast_ref()
            .expect("UserspaceIrqChip::add_vcpu called with incorrect vcpu type");
        self.vcpus.lock()[vcpu_id] = Some(vcpu.try_clone()?);
        Ok(())
    }

    fn register_edge_irq_event(
        &mut self,
        irq: u32,
        irq_event: &IrqEdgeEvent,
        source: IrqEventSource,
    ) -> Result<Option<IrqEventIndex>> {
        self.register_irq_event(irq, irq_event.get_trigger(), None, source)
    }

    fn unregister_edge_irq_event(&mut self, irq: u32, irq_event: &IrqEdgeEvent) -> Result<()> {
        self.unregister_irq_event(irq, irq_event.get_trigger())
    }

    fn register_level_irq_event(
        &mut self,
        irq: u32,
        irq_event: &IrqLevelEvent,
        source: IrqEventSource,
    ) -> Result<Option<IrqEventIndex>> {
        self.register_irq_event(
            irq,
            irq_event.get_trigger(),
            Some(irq_event.get_resample()),
            source,
        )
    }

    fn unregister_level_irq_event(&mut self, irq: u32, irq_event: &IrqLevelEvent) -> Result<()> {
        self.unregister_irq_event(irq, irq_event.get_trigger())
    }

    fn route_irq(&mut self, route: IrqRoute) -> Result<()> {
        self.routes.lock().add(route)
    }

    fn set_irq_routes(&mut self, routes: &[IrqRoute]) -> Result<()> {
        self.routes.lock().replace_all(routes)
    }

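    /// Returns an `(index, source, event)` tuple for every registered IRQ `Event`, cloning each
    /// event so the caller can wait on them.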
    fn irq_event_tokens(&self) -> Result<Vec<(IrqEventIndex, IrqEventSource, Event)>> {
        let mut tokens: Vec<(IrqEventIndex, IrqEventSource, Event)> = Vec::new();
        for (index, evt) in self.irq_events.lock().iter().enumerate() {
            if let Some(evt) = evt {
                tokens.push((index, evt.source.clone(), evt.event.try_clone()?));
            }
        }
        Ok(tokens)
    }

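    /// Asserts or deasserts (per `level`) every route configured for GSI `irq`: PIC and IOAPIC
    /// routes are forwarded to the corresponding chip, and MSI routes are decoded and sent to the
    /// destination APIC(s).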
    fn service_irq(&mut self, irq: u32, level: bool) -> Result<()> {
        for route in self.routes.lock()[irq as usize].iter() {
            match *route {
                IrqSource::Irqchip {
                    chip: IrqSourceChip::PicPrimary,
                    pin,
                }
                | IrqSource::Irqchip {
                    chip: IrqSourceChip::PicSecondary,
                    pin,
                } => {
                    self.pic.lock().service_irq(pin as u8, level);
                }
                IrqSource::Irqchip {
                    chip: IrqSourceChip::Ioapic,
                    pin,
                } => {
                    self.ioapic.lock().service_irq(pin as usize, level);
                }
                // service_irq's level parameter is ignored for MSIs.  MSI data specifies the level.
                IrqSource::Msi { address, data } => {
                    self.send_msi(address as u32, data);
                }
                _ => {
                    error!("Unexpected route source {:?}", route);
                    return Err(Error::new(libc::EINVAL));
                }
            }
        }
        Ok(())
    }

    /// Services an IRQ event by asserting then deasserting an IRQ line.  The associated Event
    /// that triggered the irq event will be read from.  If the irq is associated with a resample
    /// Event, then the deassert will only happen after an EOI is broadcast for a vector
    /// associated with the irq line.
    /// For UserspaceIrqChip, this function identifies the destination(s) of the irq: PIC, IOAPIC,
    /// or APIC (MSI).  If it's a PIC or IOAPIC route, we attempt to call service_irq on those
    /// chips.  If the IOAPIC is unable to be immediately locked, we add the irq to the
    /// delayed_ioapic_irq_events (though we still read from the Event that triggered the irq
    /// event).  If it's an MSI route, we call send_msi to decode the MSI and send it to the
    /// destination APIC(s).
    fn service_irq_event(&mut self, event_index: IrqEventIndex) -> Result<()> {
        let irq_events = self.irq_events.lock();
        let evt = if let Some(evt) = &irq_events[event_index] {
            evt
        } else {
            return Ok(());
        };
        evt.event.wait()?;

        for route in self.routes.lock()[evt.gsi as usize].iter() {
            match *route {
                IrqSource::Irqchip {
                    chip: IrqSourceChip::PicPrimary,
                    pin,
                }
                | IrqSource::Irqchip {
                    chip: IrqSourceChip::PicSecondary,
                    pin,
                } => {
                    let mut pic = self.pic.lock();
                    if evt.resample_event.is_some() {
                        pic.service_irq(pin as u8, true);
                    } else {
                        pic.service_irq(pin as u8, true);
                        pic.service_irq(pin as u8, false);
                    }
                }
                IrqSource::Irqchip {
                    chip: IrqSourceChip::Ioapic,
                    pin,
                } => {
                    if let Ok(mut ioapic) = self.ioapic.try_lock() {
                        if evt.resample_event.is_some() {
                            ioapic.service_irq(pin as usize, true);
                        } else {
                            ioapic.service_irq(pin as usize, true);
                            ioapic.service_irq(pin as usize, false);
                        }
                    } else {
                        let mut delayed_events = self.delayed_ioapic_irq_events.lock();
                        delayed_events.events.push(event_index);
                        delayed_events.trigger.signal().unwrap();
                    }
                }
                IrqSource::Msi { address, data } => self.send_msi(address as u32, data),
                _ => {
                    error!("Unexpected route source {:?}", route);
                    return Err(Error::new(libc::EINVAL));
                }
            }
        }

        Ok(())
    }

    /// Broadcasts an end of interrupt.  For UserspaceIrqChip this sends the EOI to the ioapic.
    fn broadcast_eoi(&self, vector: u8) -> Result<()> {
        self.ioapic.lock().end_of_interrupt(vector);
        Ok(())
    }

    /// Injects any pending interrupts for `vcpu`.
    ///
    /// For UserspaceIrqChip this:
    ///   * Injects a PIC interrupt, if vcpu_id is 0 and vcpu is ready for interrupt
    ///   * Injects an APIC fixed interrupt, if vcpu is ready for interrupt and PIC didn't inject
    ///   * Injects APIC NMIs
    ///   * Handles APIC INIT IPIs
    ///   * Handles APIC SIPIs
    ///   * Requests an interrupt window, if PIC or APIC still has pending interrupts for this vcpu
    fn inject_interrupts(&self, vcpu: &dyn Vcpu) -> Result<()> {
        let vcpu: &V = vcpu
            .downcast_ref()
            .expect("UserspaceIrqChip::inject_interrupts called with incorrect vcpu type");
        let vcpu_id = vcpu.id();
        let mut vcpu_ready = vcpu.ready_for_interrupt();

        let mut pic_needs_window = false;
        if vcpu_id == 0 {
            let mut pic = self.pic.lock();
            if vcpu_ready {
                if let Some(vector) = pic.get_external_interrupt() {
                    vcpu.interrupt(vector)?;
                    self.apics[vcpu_id].lock().set_mp_state(&MPState::Runnable);
                    // Already injected a PIC interrupt, so APIC fixed interrupt can't be injected.
                    vcpu_ready = false;
                }
            }
            pic_needs_window = pic.interrupt_requested();
        }

        let irqs = self.apics[vcpu_id].lock().get_pending_irqs(vcpu_ready);
        if let Some(vector) = irqs.fixed {
            let do_interrupt = {
                let mut apic = self.apics[vcpu_id].lock();
                match apic.get_mp_state() {
                    MPState::Runnable | MPState::Halted => {
                        // APIC interrupts should only be injectable when the MPState is
                        // Halted or Runnable.
                        apic.set_mp_state(&MPState::Runnable);
                        true
                    }
                    s => {
                        // This shouldn't happen, but log a helpful error if it does.
                        error!("Interrupt cannot be injected while in state: {:?}", s);
                        false
                    }
                }
            };

            if do_interrupt {
                vcpu.interrupt(vector)?;
            }
        }
        for _ in 0..irqs.nmis {
            let prev_state = self.apics[vcpu_id].lock().get_mp_state();
            vcpu.inject_nmi()?;
            self.apics[vcpu_id].lock().set_mp_state(&MPState::Runnable);
            info!(
                "Delivered NMI to cpu {}, mp_state was {:?}, now is {:?}",
                vcpu_id,
                prev_state,
                MPState::Runnable
            );
        }
        if irqs.init {
            {
                let mut apic = self.apics[vcpu_id].lock();
                apic.load_reset_state();
                apic.set_mp_state(&MPState::InitReceived);
            }
            info!("Delivered INIT IPI to cpu {}", vcpu_id);
        }
        if let Some(vector) = irqs.startup {
            // If our state is not MPState::InitReceived then this is probably
            // the second SIPI in the INIT-SIPI-SIPI sequence; ignore.
            if self.apics[vcpu_id].lock().get_mp_state() == MPState::InitReceived {
                self.deliver_startup(vcpu, vector)?;
                self.apics[vcpu_id].lock().set_mp_state(&MPState::Runnable);
                info!("Delivered SIPI to cpu {}", vcpu_id);
            }
        }

        let needs_window = pic_needs_window || irqs.needs_window;
        vcpu.set_interrupt_window_requested(needs_window);

        Ok(())
    }

    /// Notifies the irq chip that the specified VCPU has executed a halt instruction.
    /// For `UserspaceIrqChip`, it sets the APIC's mp_state to `MPState::Halted`.
    fn halted(&self, vcpu_id: usize) {
        self.apics[vcpu_id].lock().set_mp_state(&MPState::Halted)
    }

    /// Blocks until `vcpu` is in a runnable state or until interrupted by
    /// `IrqChip::kick_halted_vcpus`.  Returns `VcpuRunState::Runnable` if the vcpu is runnable, or
    /// `VcpuRunState::Interrupted` if the wait was interrupted.
    /// For `UserspaceIrqChip`, if the APIC isn't `MPState::Runnable`, sleep until there are new
    /// interrupts pending on the APIC, inject the interrupts, and go back to sleep if still not
    /// runnable.
    fn wait_until_runnable(&self, vcpu: &dyn Vcpu) -> Result<VcpuRunState> {
        let vcpu_id = vcpu.id();
        let waiter = &self.waiters[vcpu_id];
        let mut interrupted_lock = waiter.mtx.lock();
        loop {
            if *interrupted_lock {
                *interrupted_lock = false;
                info!("wait_until_runnable interrupted on cpu {}", vcpu_id);
                return Ok(VcpuRunState::Interrupted);
            }
            if self.is_runnable(vcpu_id) {
                return Ok(VcpuRunState::Runnable);
            }

            self.inject_interrupts(vcpu)?;
            if self.is_runnable(vcpu_id) {
                return Ok(VcpuRunState::Runnable);
            }
            interrupted_lock = waiter.cvar.wait(interrupted_lock);
        }
    }

    /// Makes unrunnable VCPUs return immediately from `wait_until_runnable`.
    /// For UserspaceIrqChip, every vcpu gets kicked so its current or next call to
    /// `wait_until_runnable` will immediately return `VcpuRunState::Interrupted`.  After that one
    /// kick, subsequent `wait_until_runnable` calls go back to waiting for runnability normally.
    fn kick_halted_vcpus(&self) {
        for waiter in self.waiters.iter() {
            waiter.set_and_notify(/* interrupted= */ true);
        }
    }

    fn get_mp_state(&self, vcpu_id: usize) -> Result<MPState> {
        Ok(self.apics[vcpu_id].lock().get_mp_state())
    }

    fn set_mp_state(&mut self, vcpu_id: usize, state: &MPState) -> Result<()> {
        self.apics[vcpu_id].lock().set_mp_state(state);
        Ok(())
    }

    fn try_clone(&self) -> Result<Self> {
        // Fields like timer_descriptors don't change after construction, so they could be plain
        // Vecs with each element cloned.  But the Arc<Mutex> fields avoid a quadratic number of
        // open descriptors from cloning, and those fields aren't performance critical.
        Ok(UserspaceIrqChip {
            vcpus: self.vcpus.clone(),
            waiters: self.waiters.clone(),
            routes: self.routes.clone(),
            pit: self.pit.clone(),
            pic: self.pic.clone(),
            ioapic: self.ioapic.clone(),
            ioapic_pins: self.ioapic_pins,
            apics: self.apics.clone(),
            timer_descriptors: self.timer_descriptors.clone(),
            delayed_ioapic_irq_events: self.delayed_ioapic_irq_events.clone(),
            irq_events: self.irq_events.clone(),
            dropper: self.dropper.clone(),
            activated: self.activated,
        })
    }

    // TODO(srichman): factor out UserspaceIrqChip and KvmSplitIrqChip::finalize_devices
    fn finalize_devices(
        &mut self,
        resources: &mut SystemAllocator,
        io_bus: &Bus,
        mmio_bus: &Bus,
    ) -> Result<()> {
        // Insert pit into io_bus
        io_bus.insert(self.pit.clone(), 0x040, 0x8).unwrap();
        io_bus.insert(self.pit.clone(), 0x061, 0x1).unwrap();

        // Insert pic into io_bus
        io_bus.insert(self.pic.clone(), 0x20, 0x2).unwrap();
        io_bus.insert(self.pic.clone(), 0xa0, 0x2).unwrap();
        io_bus.insert(self.pic.clone(), 0x4d0, 0x2).unwrap();

        // Insert ioapic into mmio_bus
        mmio_bus
            .insert(
                self.ioapic.clone(),
                IOAPIC_BASE_ADDRESS,
                IOAPIC_MEM_LENGTH_BYTES,
            )
            .unwrap();

        // Insert self into mmio_bus for handling APIC mmio
        mmio_bus
            .insert_sync(
                Arc::new(self.try_clone()?),
                APIC_BASE_ADDRESS,
                APIC_MEM_LENGTH_BYTES,
            )
            .unwrap();

        // At this point, all of our devices have been created and they have registered their
        // irq events, so we can clone our resample events
        let mut ioapic_resample_events: Vec<Vec<Event>> =
            (0..self.ioapic_pins).map(|_| Vec::new()).collect();
        let mut pic_resample_events: Vec<Vec<Event>> =
            (0..self.ioapic_pins).map(|_| Vec::new()).collect();

        for evt in self.irq_events.lock().iter().flatten() {
            if (evt.gsi as usize) >= self.ioapic_pins {
                continue;
            }
            if let Some(resample_evt) = &evt.resample_event {
                ioapic_resample_events[evt.gsi as usize].push(resample_evt.try_clone()?);
                pic_resample_events[evt.gsi as usize].push(resample_evt.try_clone()?);
            }
        }

        // Register resample events with the ioapic
        self.ioapic
            .lock()
            .register_resample_events(ioapic_resample_events);
        // Register resample events with the pic
        self.pic
            .lock()
            .register_resample_events(pic_resample_events);

        // Make sure all future irq numbers are >= self.ioapic_pins
        let mut irq_num = resources.allocate_irq().unwrap();
        while irq_num < self.ioapic_pins as u32 {
            irq_num = resources.allocate_irq().unwrap();
        }

        // Spawn timer threads here instead of in new(), in case crosvm is in sandbox mode.
        self.activated = true;
        let _ = self.wake();

        Ok(())
    }

    /// The UserspaceIrqChip's ioapic may be locked because a vcpu thread is currently writing to
    /// the ioapic, and the ioapic may be blocking on adding MSI routes, which requires blocking
    /// tube communication back to the main thread.  Thus, we do not want the main thread to
    /// block on a locked ioapic, so any irqs that could not be serviced because the ioapic could
    /// not be immediately locked are added to the delayed_ioapic_irq_events Vec. This function
    /// processes each delayed event in the vec each time it's called. If the ioapic is still
    /// locked, we keep the queued irqs for the next time this function is called.
    fn process_delayed_irq_events(&mut self) -> Result<()> {
        let irq_events = self.irq_events.lock();
        let mut delayed_events = self.delayed_ioapic_irq_events.lock();
        delayed_events.events.retain(|&event_index| {
            if let Some(evt) = &irq_events[event_index] {
                if let Ok(mut ioapic) = self.ioapic.try_lock() {
                    if evt.resample_event.is_some() {
                        ioapic.service_irq(evt.gsi as usize, true);
                    } else {
                        ioapic.service_irq(evt.gsi as usize, true);
                        ioapic.service_irq(evt.gsi as usize, false);
                    }

                    false
                } else {
                    true
                }
            } else {
                true
            }
        });

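        // If every delayed event was handled, consume the trigger event so the main thread isn't
        // woken again until a new event is queued; otherwise leave it signaled so this function
        // gets called again to retry the remaining events.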
        if delayed_events.events.is_empty() {
            delayed_events.trigger.wait()?;
        }
        Ok(())
    }

    fn irq_delayed_event_token(&self) -> Result<Option<Event>> {
        Ok(Some(
            self.delayed_ioapic_irq_events.lock().trigger.try_clone()?,
        ))
    }

    fn check_capability(&self, c: IrqChipCap) -> bool {
        match c {
            IrqChipCap::TscDeadlineTimer => false,
            IrqChipCap::X2Apic => false,
            IrqChipCap::MpStateGetSet => true,
        }
    }
}

impl<V: VcpuX86_64 + 'static> BusDevice for UserspaceIrqChip<V> {
    fn debug_label(&self) -> String {
        "UserspaceIrqChip APIC".to_string()
    }
    fn device_id(&self) -> DeviceId {
        CrosvmDeviceId::UserspaceIrqChip.into()
    }
}

impl<V: VcpuX86_64 + 'static> Suspendable for UserspaceIrqChip<V> {
    fn sleep(&mut self) -> anyhow::Result<()> {
        let mut dropper = self.dropper.lock();
        dropper.sleep()
    }

    fn wake(&mut self) -> anyhow::Result<()> {
        if self.activated {
            // Create the timer workers and run them.
            let mut dropper = self.dropper.lock();
            for (i, descriptor) in self.timer_descriptors.iter().enumerate() {
                let mut worker = TimerWorker {
                    id: i,
                    apic: self.apics[i].clone(),
                    descriptor: *descriptor,
                    vcpus: self.vcpus.clone(),
                    waiter: self.waiters[i].clone(),
                };
                let worker_thread =
                    WorkerThread::start(format!("UserspaceIrqChip timer worker {i}"), move |evt| {
                        if let Err(e) = worker.run(evt) {
                            error!("UserspaceIrqChip worker failed: {e:#}");
                        }
                    });
                dropper.workers.push(worker_thread);
            }
        }
        Ok(())
    }
}

impl<V: VcpuX86_64 + 'static> BusDeviceSync for UserspaceIrqChip<V> {
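    /// Reads from the local APIC's MMIO registers.  `info.id` is the id of the vcpu making the
    /// access, which selects the APIC to read from.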
    fn read(&self, info: BusAccessInfo, data: &mut [u8]) {
        self.apics[info.id].lock().read(info.offset, data)
    }
    fn write(&self, info: BusAccessInfo, data: &[u8]) {
        let msg = self.apics[info.id].lock().write(info.offset, data);
        if let Some(m) = msg {
            self.handle_msg(m);
        }
    }
}

impl<V: VcpuX86_64 + 'static> IrqChipX86_64 for UserspaceIrqChip<V> {
    fn try_box_clone(&self) -> Result<Box<dyn IrqChipX86_64>> {
        Ok(Box::new(self.try_clone()?))
    }

    fn as_irq_chip(&self) -> &dyn IrqChip {
        self
    }

    fn as_irq_chip_mut(&mut self) -> &mut dyn IrqChip {
        self
    }

    fn get_pic_state(&self, select: PicSelect) -> Result<PicState> {
        Ok(self.pic.lock().get_pic_state(select))
    }

    fn set_pic_state(&mut self, select: PicSelect, state: &PicState) -> Result<()> {
        self.pic.lock().set_pic_state(select, state);
        Ok(())
    }

    fn get_ioapic_state(&self) -> Result<IoapicState> {
        Ok(self.ioapic.lock().get_ioapic_state())
    }

    fn set_ioapic_state(&mut self, state: &IoapicState) -> Result<()> {
        self.ioapic.lock().set_ioapic_state(state);
        Ok(())
    }

    fn get_lapic_state(&self, vcpu_id: usize) -> Result<LapicState> {
        Ok(self.apics[vcpu_id].lock().get_state())
    }

    fn set_lapic_state(&mut self, vcpu_id: usize, state: &LapicState) -> Result<()> {
        self.apics[vcpu_id].lock().set_state(state);
        Ok(())
    }

    /// Get the lapic frequency in Hz
    fn lapic_frequency(&self) -> u32 {
        Apic::frequency()
    }

    fn get_pit(&self) -> Result<PitState> {
        Ok(self.pit.lock().get_pit_state())
    }

    fn set_pit(&mut self, state: &PitState) -> Result<()> {
        self.pit.lock().set_pit_state(state);
        Ok(())
    }

    /// Returns true if the PIT uses port 0x61 for the PC speaker, false if 0x61 is unused.
    /// devices::Pit uses 0x61.
    fn pit_uses_speaker_port(&self) -> bool {
        true
    }

    fn snapshot_chip_specific(&self) -> anyhow::Result<AnySnapshot> {
        Err(anyhow::anyhow!("Not supported yet in userspace"))
    }
    fn restore_chip_specific(&mut self, _data: AnySnapshot) -> anyhow::Result<()> {
        Err(anyhow::anyhow!("Not supported yet in userspace"))
    }
}

/// Condition variable used by `UserspaceIrqChip::wait_until_runnable`.
#[derive(Default)]
struct Waiter {
    // mtx stores an "interrupted" bool that's true if `kick_halted_vcpus` has been called.
    mtx: Mutex<bool>,
    cvar: Condvar,
}

impl Waiter {
    /// Wakes up `wait_until_runnable` to recheck the interrupted flag and vcpu runnable state.
    pub fn notify(&self) {
        let _lock = self.mtx.lock();
        self.cvar.notify_all();
    }

    /// Sets the interrupted flag, and wakes up `wait_until_runnable` to recheck the interrupted
    /// flag and vcpu runnable state.  If `interrupted` is true, then `wait_until_runnable` should
    /// stop waiting for a runnable vcpu and return immediately.
    pub fn set_and_notify(&self, interrupted: bool) {
        let mut interrupted_lock = self.mtx.lock();
        *interrupted_lock = interrupted;
        self.cvar.notify_all();
    }
}

/// Worker thread for polling timer events and sending them to an APIC.
struct TimerWorker<V: VcpuX86_64> {
    id: usize,
    apic: Arc<Mutex<Apic>>,
    vcpus: Arc<Mutex<Vec<Option<V>>>>,
    descriptor: Descriptor,
    waiter: Arc<Waiter>,
}

impl<V: VcpuX86_64> TimerWorker<V> {
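    /// Polls the APIC timer descriptor until `kill_evt` is signaled, delivering each timer
    /// expiration to the APIC, requesting an interrupt window on the vcpu, and waking its waiter.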
    fn run(&mut self, kill_evt: Event) -> TimerWorkerResult<()> {
        #[derive(EventToken)]
        enum Token {
            // The timer expired.
            TimerExpire,
            // The parent thread requested an exit.
            Kill,
        }

        let wait_ctx: WaitContext<Token> = WaitContext::build_with(&[
            (&self.descriptor, Token::TimerExpire),
            (&kill_evt, Token::Kill),
        ])
        .map_err(TimerWorkerError::CreateWaitContext)?;

        loop {
            let events = wait_ctx.wait().map_err(TimerWorkerError::WaitError)?;
            for event in events.iter().filter(|e| e.is_readable) {
                match event.token {
                    Token::TimerExpire => {
                        self.apic.lock().handle_timer_expiration();
                        if let Some(Some(vcpu)) = self.vcpus.lock().get(self.id) {
                            vcpu.set_interrupt_window_requested(true);
                        }
                        self.waiter.notify();
                    }
                    Token::Kill => return Ok(()),
                }
            }
        }
    }
}

#[derive(Debug)]
enum TimerWorkerError {
    /// Creating WaitContext failed.
    CreateWaitContext(Error),
    /// Error while waiting for events.
    WaitError(Error),
}

impl Display for TimerWorkerError {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        use self::TimerWorkerError::*;

        match self {
            CreateWaitContext(e) => write!(f, "failed to create event context: {e}"),
            WaitError(e) => write!(f, "failed to wait for events: {e}"),
        }
    }
}

impl std::error::Error for TimerWorkerError {}

type TimerWorkerResult<T> = std::result::Result<T, TimerWorkerError>;