// Source file: devices/irqchip/userspace.rs

1// Copyright 2020 The ChromiumOS Authors
2// Use of this source code is governed by a BSD-style license that can be
3// found in the LICENSE file.
4
5use std::convert::TryFrom;
6use std::convert::TryInto;
7use std::fmt;
8use std::fmt::Display;
9use std::iter;
10use std::sync::atomic::AtomicBool;
11use std::sync::atomic::Ordering;
12use std::sync::Arc;
13
14cfg_if::cfg_if! {
15    if #[cfg(test)] {
16        use base::{FakeClock as Clock, FakeTimer as Timer};
17    } else {
18        use base::{Clock, Timer};
19    }
20}
21use base::error;
22use base::info;
23use base::warn;
24use base::AsRawDescriptor;
25use base::Descriptor;
26use base::Error;
27use base::Event;
28use base::EventToken;
29use base::Result;
30use base::Tube;
31use base::WaitContext;
32use base::WorkerThread;
33use hypervisor::DeliveryMode;
34use hypervisor::IoapicState;
35use hypervisor::IrqRoute;
36use hypervisor::IrqSource;
37use hypervisor::IrqSourceChip;
38use hypervisor::LapicState;
39use hypervisor::MPState;
40use hypervisor::MsiAddressMessage;
41use hypervisor::MsiDataMessage;
42use hypervisor::PicSelect;
43use hypervisor::PicState;
44use hypervisor::PitState;
45use hypervisor::VcpuArch;
46use hypervisor::VcpuX86_64;
47use resources::SystemAllocator;
48use snapshot::AnySnapshot;
49use sync::Condvar;
50use sync::Mutex;
51use vm_control::DeviceId;
52use vm_control::PlatformDeviceId;
53
54use crate::bus::BusDeviceSync;
55use crate::irqchip::Apic;
56use crate::irqchip::ApicBusMsg;
57use crate::irqchip::DelayedIoApicIrqEvents;
58use crate::irqchip::Interrupt;
59use crate::irqchip::InterruptData;
60use crate::irqchip::InterruptDestination;
61use crate::irqchip::Ioapic;
62use crate::irqchip::IrqEvent;
63use crate::irqchip::IrqEventIndex;
64use crate::irqchip::Pic;
65use crate::irqchip::Routes;
66use crate::irqchip::VcpuRunState;
67use crate::irqchip::APIC_BASE_ADDRESS;
68use crate::irqchip::APIC_MEM_LENGTH_BYTES;
69use crate::irqchip::IOAPIC_BASE_ADDRESS;
70use crate::irqchip::IOAPIC_MEM_LENGTH_BYTES;
71use crate::Bus;
72use crate::BusAccessInfo;
73use crate::BusDevice;
74use crate::IrqChip;
75use crate::IrqChipCap;
76use crate::IrqChipX86_64;
77use crate::IrqEdgeEvent;
78use crate::IrqEventSource;
79use crate::IrqLevelEvent;
80use crate::Pit;
81use crate::PitError;
82use crate::Suspendable;
83
/// PIT channel 0 timer is connected to IRQ 0
const PIT_CHANNEL0_IRQ: u32 = 0;
/// CR0 extension type bit
const X86_CR0_ET: u64 = 0x00000010;
/// CR0 not write through bit
const X86_CR0_NW: u64 = 0x20000000;
/// CR0 cache disable bit
const X86_CR0_CD: u64 = 0x40000000;
/// Default power on state of CR0 register, according to the Intel manual.
/// ET | NW | CD; used by `deliver_startup` to reset an AP's CR0 before a SIPI.
const X86_CR0_INIT: u64 = X86_CR0_ET | X86_CR0_NW | X86_CR0_CD;
94
/// An `IrqChip` with all interrupt devices emulated in userspace.  `UserspaceIrqChip` works with
/// any hypervisor, but only supports x86.
pub struct UserspaceIrqChip {
    /// Vcpus registered via `add_vcpu`; each slot is `None` until its vcpu is added.
    pub vcpus: Arc<Mutex<Vec<Option<Arc<dyn VcpuX86_64>>>>>,
    /// GSI routing table mapping irq numbers to PIC/ioapic pins or MSI routes.
    routes: Mutex<Routes>,
    /// Emulated programmable interval timer; its channel 0 output is wired to IRQ 0.
    pit: Arc<Mutex<Pit>>,
    /// Emulated PIC; routes for both `PicPrimary` and `PicSecondary` are serviced by it.
    pic: Arc<Mutex<Pic>>,
    /// Emulated I/O APIC.
    ioapic: Arc<Mutex<Ioapic>>,
    /// Number of pins on the emulated ioapic.
    ioapic_pins: usize,
    /// One emulated local APIC per vcpu, indexed by vcpu id.
    pub apics: Vec<Arc<Mutex<Apic>>>,
    // Condition variables used by wait_until_runnable.
    waiters: Vec<Arc<Waiter>>,
    // Raw descriptors of the apic Timers.
    timer_descriptors: Vec<Descriptor>,
    /// Delayed ioapic irq object, that contains the delayed events because the ioapic was locked
    /// when service_irq was called on the irqchip. This prevents deadlocks when a Vcpu thread has
    /// locked the ioapic and the ioapic sends a AddMsiRoute signal to the main thread (which
    /// itself may be busy trying to call service_irq).
    ///
    /// ## Note:
    /// This lock may be locked by itself to access the `DelayedIoApicIrqEvents`. If accessed in
    /// conjunction with the `irq_events` field, that lock should be taken first to prevent
    /// deadlocks stemming from lock-ordering issues.
    delayed_ioapic_irq_events: Mutex<DelayedIoApicIrqEvents>,
    // Array of Events that devices will use to assert ioapic pins.
    irq_events: Mutex<Vec<Option<IrqEvent>>>,
    /// Worker threads that deliver timer events to the APICs.
    workers: Mutex<Vec<WorkerThread<()>>>,
    /// Set by `finalize_devices`; gates worker-thread creation in `wake_internal`.
    activated: AtomicBool,
}
125
126impl UserspaceIrqChip {
127    /// Constructs a new `UserspaceIrqChip`.
128    pub fn new(num_vcpus: usize, irq_tube: Tube, ioapic_pins: Option<usize>) -> Result<Self> {
129        let clock = Arc::new(Mutex::new(Clock::new()));
130        Self::new_with_clock(num_vcpus, irq_tube, ioapic_pins, clock)
131    }
132
133    /// Constructs a new `UserspaceIrqChip`, with a clock.  Used for testing.
134    pub fn new_with_clock(
135        num_vcpus: usize,
136        irq_tube: Tube,
137        ioapic_pins: Option<usize>,
138        clock: Arc<Mutex<Clock>>,
139    ) -> Result<Self> {
140        let pit_evt = IrqEdgeEvent::new()?;
141        // For test only, this clock instance is FakeClock. It needs to be cloned for every Timer
142        // instance, so make a clone for it now.
143        #[cfg(test)]
144        let test_clock = clock.clone();
145        let pit = Pit::new(pit_evt.try_clone()?, clock).map_err(|e| match e {
146            PitError::CloneEvent(err) => err,
147            PitError::CreateEvent(err) => err,
148            PitError::CreateWaitContext(err) => err,
149            PitError::TimerCreateError(err) => err,
150            PitError::WaitError(err) => err,
151            PitError::SpawnThread(_) => Error::new(libc::EIO),
152        })?;
153        let pit_event_source = IrqEventSource::from_device(&pit);
154
155        let ioapic_pins = ioapic_pins.unwrap_or(hypervisor::NUM_IOAPIC_PINS);
156        let ioapic = Ioapic::new(irq_tube, ioapic_pins)?;
157
158        let mut timer_descriptors: Vec<Descriptor> = Vec::with_capacity(num_vcpus);
159        let mut apics: Vec<Arc<Mutex<Apic>>> = Vec::with_capacity(num_vcpus);
160        for id in 0..num_vcpus {
161            cfg_if::cfg_if! {
162                if #[cfg(test)] {
163                    let timer = Timer::new(test_clock.clone());
164                } else {
165                    let timer = Timer::new()?;
166                }
167            }
168            // Timers are owned by the apics, which outlive the raw descriptors stored here and in
169            // the worker threads.
170            timer_descriptors.push(Descriptor(timer.as_raw_descriptor()));
171
172            let id: u8 = id.try_into().or(Err(Error::new(libc::EINVAL)))?;
173            let apic = Apic::new(id, Box::new(timer));
174            apics.push(Arc::new(Mutex::new(apic)));
175        }
176
177        let chip = UserspaceIrqChip {
178            vcpus: Arc::new(Mutex::new(
179                iter::repeat_with(|| None).take(num_vcpus).collect(),
180            )),
181            waiters: iter::repeat_with(Default::default)
182                .take(num_vcpus)
183                .collect(),
184            routes: Mutex::new(Routes::new()),
185            pit: Arc::new(Mutex::new(pit)),
186            pic: Arc::new(Mutex::new(Pic::new())),
187            ioapic: Arc::new(Mutex::new(ioapic)),
188            ioapic_pins,
189            apics,
190            timer_descriptors,
191            delayed_ioapic_irq_events: Mutex::new(DelayedIoApicIrqEvents::new()?),
192            irq_events: Mutex::new(Vec::new()),
193            workers: Mutex::new(Vec::new()),
194            activated: AtomicBool::new(false),
195        };
196
197        // Setup standard x86 irq routes
198        chip.set_irq_routes(&Routes::default_pic_ioapic_routes(ioapic_pins))?;
199
200        chip.register_edge_irq_event(PIT_CHANNEL0_IRQ, &pit_evt, pit_event_source)?;
201        Ok(chip)
202    }
203
204    /// Handles a message from an APIC.
205    fn handle_msg(&self, msg: ApicBusMsg) {
206        match msg {
207            ApicBusMsg::Eoi(vector) => {
208                let _ = self.broadcast_eoi(vector);
209            }
210            ApicBusMsg::Ipi(interrupt) => self.send_irq_to_apics(&interrupt),
211        }
212    }
213
214    /// Sends a Message Signaled Interrupt to one or more APICs.  MSIs are a 64-bit address and
215    /// 32-bit data, but in the Intel spec we're implementing, only the low 32 bits of the address
216    /// are used.
217    fn send_msi(&self, addr: u32, data: u32) {
218        let mut msi_addr = MsiAddressMessage::new();
219        msi_addr.set(0, 32, addr as u64);
220        let dest = match InterruptDestination::try_from(&msi_addr) {
221            Ok(dest) => dest,
222            Err(e) => {
223                warn!("Invalid MSI message: {}", e);
224                return;
225            }
226        };
227
228        let mut msi_data = MsiDataMessage::new();
229        msi_data.set(0, 32, data as u64);
230        let data = InterruptData::from(&msi_data);
231
232        self.send_irq_to_apics(&Interrupt { dest, data });
233    }
234
235    pub fn send_irq_to_apic(&self, id: usize, irq: &InterruptData) {
236        // id can come from the guest, so check bounds.
237        if let Some(apic) = self.apics.get(id) {
238            apic.lock().accept_irq(irq);
239        } else {
240            error!("Interrupt for non-existent apic {}: {:?}", id, irq);
241        }
242        if let Some(Some(vcpu)) = self.vcpus.lock().get(id) {
243            vcpu.set_interrupt_window_requested(true);
244        } else {
245            error!("Interrupt for non-existent vcpu {}: {:?}", id, irq);
246        }
247        self.waiters[id].notify();
248    }
249
250    /// Sends an interrupt to one or more APICs.  Used for sending MSIs and IPIs.
251    pub fn send_irq_to_apics(&self, irq: &Interrupt) {
252        match irq.data.delivery {
253            DeliveryMode::Fixed | DeliveryMode::Lowest | DeliveryMode::RemoteRead => {}
254            _ => info!("UserspaceIrqChip received special irq: {:?}", irq),
255        }
256
257        // First try the fast path, where the destination is a single APIC we can send to directly.
258        if let Some(apic_id) = Apic::single_dest_fast(&irq.dest) {
259            self.send_irq_to_apic(apic_id as usize, &irq.data);
260            return;
261        }
262
263        let lowest_mode = irq.data.delivery == DeliveryMode::Lowest;
264        let mut lowest_priority = u8::MAX;
265        let mut lowest_apic: Option<usize> = None;
266
267        for (i, apic) in self.apics.iter().enumerate() {
268            let send = {
269                let apic = apic.lock();
270                if !apic.match_dest(&irq.dest) {
271                    false
272                } else if lowest_mode {
273                    let priority = apic.get_processor_priority();
274                    if priority <= lowest_priority {
275                        lowest_priority = priority;
276                        lowest_apic = Some(i);
277                    }
278                    false
279                } else {
280                    true
281                }
282            };
283            if send {
284                self.send_irq_to_apic(i, &irq.data);
285            }
286        }
287
288        if lowest_mode {
289            if let Some(index) = lowest_apic {
290                self.send_irq_to_apic(index, &irq.data);
291            } else {
292                // According to sections 10.6.2.1 and 10.6.2.2 of the SDM, the OS should not let
293                // this happen.  If the OS is misconfigured then drop the interrupt and log a
294                // warning.
295                warn!(
296                    "Lowest priority interrupt sent, but no apics configured as valid target: {:?}",
297                    irq
298                );
299            }
300        }
301    }
302
303    /// Delivers a startup IPI to `vcpu`.
304    fn deliver_startup(&self, vcpu: &dyn VcpuX86_64, vector: u8) -> Result<()> {
305        // This comes from Intel SDM volume 3, chapter 8.4.  The vector specifies a page aligned
306        // address where execution should start.  cs.base is the offset for the code segment with an
307        // RIP of 0.  The cs.selector is just the base shifted right by 4 bits.
308        let mut sregs = vcpu.get_sregs()?;
309        sregs.cs.base = (vector as u64) << 12;
310        sregs.cs.selector = (vector as u16) << 8;
311
312        // Set CR0 to its INIT value per the manual.  Application processors won't boot with the CR0
313        // protected mode and paging bits set by setup_sregs().  Kernel APIC doesn't have this
314        // issue, probably because it uses MSRs instead of MMIO, so it's less affected when the AP's
315        // state (CR3 etc.) doesn't reflect changes that Linux made while booting vcpu 0.
316        sregs.cr0 = X86_CR0_INIT;
317        vcpu.set_sregs(&sregs)?;
318
319        let mut regs = vcpu.get_regs()?;
320        regs.rip = 0;
321        vcpu.set_regs(&regs)?;
322
323        Ok(())
324    }
325
326    /// Checks if the specified VCPU is in a runnable state.
327    fn is_runnable(&self, vcpu_id: usize) -> bool {
328        self.apics[vcpu_id].lock().get_mp_state() == MPState::Runnable
329    }
330}
331
332impl UserspaceIrqChip {
333    pub fn wake_internal(&self) -> anyhow::Result<()> {
334        if self.activated.load(Ordering::Relaxed) {
335            // create workers and run them.
336            for (i, descriptor) in self.timer_descriptors.iter().enumerate() {
337                let mut worker = TimerWorker {
338                    id: i,
339                    apic: self.apics[i].clone(),
340                    descriptor: *descriptor,
341                    vcpus: self.vcpus.clone(),
342                    waiter: self.waiters[i].clone(),
343                };
344                let worker_thread =
345                    WorkerThread::start(format!("UserspaceIrqChip timer worker {i}"), move |evt| {
346                        if let Err(e) = worker.run(evt) {
347                            error!("UserspaceIrqChip worker failed: {e:#}");
348                        }
349                    });
350                self.workers.lock().push(worker_thread);
351            }
352        }
353        Ok(())
354    }
355
356    fn register_irq_event(
357        &self,
358        irq: u32,
359        irq_event: &Event,
360        resample_event: Option<&Event>,
361        source: IrqEventSource,
362    ) -> Result<Option<IrqEventIndex>> {
363        let mut evt = IrqEvent {
364            gsi: irq,
365            event: irq_event.try_clone()?,
366            resample_event: None,
367            source,
368        };
369        if let Some(resample_event) = resample_event {
370            evt.resample_event = Some(resample_event.try_clone()?);
371        }
372
373        let mut irq_events = self.irq_events.lock();
374        let index = irq_events.len();
375        irq_events.push(Some(evt));
376        Ok(Some(index))
377    }
378
379    fn unregister_irq_event(&self, irq: u32, irq_event: &Event) -> Result<()> {
380        let mut irq_events = self.irq_events.lock();
381        for (index, evt) in irq_events.iter().enumerate() {
382            if let Some(evt) = evt {
383                if evt.gsi == irq && irq_event.eq(&evt.event) {
384                    irq_events[index] = None;
385                    break;
386                }
387            }
388        }
389        Ok(())
390    }
391}
392
impl IrqChip for UserspaceIrqChip {
    /// Stores `vcpu` in the slot for `vcpu_id` so interrupts can be delivered to it.
    fn add_vcpu(&self, vcpu_id: usize, vcpu: Arc<dyn VcpuArch>) -> Result<()> {
        self.vcpus.lock()[vcpu_id] = Some(vcpu);
        Ok(())
    }

    /// Registers an edge-triggered irq event (no resample event needed).
    fn register_edge_irq_event(
        &self,
        irq: u32,
        irq_event: &IrqEdgeEvent,
        source: IrqEventSource,
    ) -> Result<Option<IrqEventIndex>> {
        self.register_irq_event(irq, irq_event.get_trigger(), None, source)
    }

    fn unregister_edge_irq_event(&self, irq: u32, irq_event: &IrqEdgeEvent) -> Result<()> {
        self.unregister_irq_event(irq, irq_event.get_trigger())
    }

    /// Registers a level-triggered irq event; the resample event is signaled when the line
    /// should be re-asserted after an EOI.
    fn register_level_irq_event(
        &self,
        irq: u32,
        irq_event: &IrqLevelEvent,
        source: IrqEventSource,
    ) -> Result<Option<IrqEventIndex>> {
        self.register_irq_event(
            irq,
            irq_event.get_trigger(),
            Some(irq_event.get_resample()),
            source,
        )
    }

    fn unregister_level_irq_event(&self, irq: u32, irq_event: &IrqLevelEvent) -> Result<()> {
        self.unregister_irq_event(irq, irq_event.get_trigger())
    }

    /// Adds a single route to the routing table.
    fn route_irq(&self, route: IrqRoute) -> Result<()> {
        self.routes.lock().add(route)
    }

    /// Replaces the entire routing table with `routes`.
    fn set_irq_routes(&self, routes: &[IrqRoute]) -> Result<()> {
        self.routes.lock().replace_all(routes)
    }

    /// Returns a (index, source, event) tuple for every registered irq event, so callers can wait
    /// on the events and call `service_irq_event` with the index that fired.
    fn irq_event_tokens(&self) -> Result<Vec<(IrqEventIndex, IrqEventSource, Event)>> {
        let mut tokens: Vec<(IrqEventIndex, IrqEventSource, Event)> = Vec::new();
        for (index, evt) in self.irq_events.lock().iter().enumerate() {
            if let Some(evt) = evt {
                tokens.push((index, evt.source.clone(), evt.event.try_clone()?));
            }
        }
        Ok(tokens)
    }

    /// Asserts or deasserts `irq` on every chip (PIC/ioapic/MSI) its routes point at.
    /// Unlike `service_irq_event`, this locks the ioapic unconditionally.
    fn service_irq(&self, irq: u32, level: bool) -> Result<()> {
        for route in self.routes.lock()[irq as usize].iter() {
            match *route {
                IrqSource::Irqchip {
                    chip: IrqSourceChip::PicPrimary,
                    pin,
                }
                | IrqSource::Irqchip {
                    chip: IrqSourceChip::PicSecondary,
                    pin,
                } => {
                    self.pic.lock().service_irq(pin as u8, level);
                }
                IrqSource::Irqchip {
                    chip: IrqSourceChip::Ioapic,
                    pin,
                } => {
                    self.ioapic.lock().service_irq(pin as usize, level);
                }
                // service_irq's level parameter is ignored for MSIs.  MSI data specifies the level.
                IrqSource::Msi { address, data } => {
                    self.send_msi(address as u32, data);
                }
                _ => {
                    error!("Unexpected route source {:?}", route);
                    return Err(Error::new(libc::EINVAL));
                }
            }
        }
        Ok(())
    }

    /// Services an IRQ event by asserting then deasserting an IRQ line.  The associated Event
    /// that triggered the irq event will be read from.  If the irq is associated with a resample
    /// Event, then the deassert will only happen after an EOI is broadcast for a vector
    /// associated with the irq line.
    /// For UserspaceIrqChip, this function identifies the destination(s) of the irq: PIC, IOAPIC,
    /// or APIC (MSI).  If it's a PIC or IOAPIC route, we attempt to call service_irq on those
    /// chips.  If the IOAPIC is unable to be immediately locked, we add the irq to the
    /// delayed_ioapic_irq_events (though we still read from the Event that triggered the irq
    /// event).  If it's an MSI route, we call send_msi to decode the MSI and send it to the
    /// destination APIC(s).
    fn service_irq_event(&self, event_index: IrqEventIndex) -> Result<()> {
        let irq_events = self.irq_events.lock();
        let evt = if let Some(evt) = &irq_events[event_index] {
            evt
        } else {
            // Entry was unregistered; nothing to service.
            return Ok(());
        };
        // Consume the triggering Event so it doesn't stay readable.
        evt.event.wait()?;

        for route in self.routes.lock()[evt.gsi as usize].iter() {
            match *route {
                IrqSource::Irqchip {
                    chip: IrqSourceChip::PicPrimary,
                    pin,
                }
                | IrqSource::Irqchip {
                    chip: IrqSourceChip::PicSecondary,
                    pin,
                } => {
                    let mut pic = self.pic.lock();
                    // With a resample event the line stays asserted until EOI; without one,
                    // emulate an edge by asserting then immediately deasserting.
                    if evt.resample_event.is_some() {
                        pic.service_irq(pin as u8, true);
                    } else {
                        pic.service_irq(pin as u8, true);
                        pic.service_irq(pin as u8, false);
                    }
                }
                IrqSource::Irqchip {
                    chip: IrqSourceChip::Ioapic,
                    pin,
                } => {
                    // try_lock avoids deadlock with a vcpu thread holding the ioapic lock while
                    // waiting on the main thread (see field docs on delayed_ioapic_irq_events).
                    if let Ok(mut ioapic) = self.ioapic.try_lock() {
                        if evt.resample_event.is_some() {
                            ioapic.service_irq(pin as usize, true);
                        } else {
                            ioapic.service_irq(pin as usize, true);
                            ioapic.service_irq(pin as usize, false);
                        }
                    } else {
                        let mut delayed_events = self.delayed_ioapic_irq_events.lock();
                        delayed_events.events.push(event_index);
                        delayed_events.trigger.signal().unwrap();
                    }
                }
                IrqSource::Msi { address, data } => self.send_msi(address as u32, data),
                _ => {
                    error!("Unexpected route source {:?}", route);
                    return Err(Error::new(libc::EINVAL));
                }
            }
        }

        Ok(())
    }

    /// Broadcasts an end of interrupt.  For UserspaceIrqChip this sends the EOI to the ioapic.
    fn broadcast_eoi(&self, vector: u8) -> Result<()> {
        self.ioapic.lock().end_of_interrupt(vector);
        Ok(())
    }

    /// Injects any pending interrupts for `vcpu`.
    ///
    /// For UserspaceIrqChip this:
    ///   * Injects a PIC interrupt, if vcpu_id is 0 and vcpu is ready for interrupt
    ///   * Injects an APIC fixed interrupt, if vcpu is ready for interrupt and PIC didn't inject
    ///   * Injects APIC NMIs
    ///   * Handles APIC INIT IPIs
    ///   * Handles APIC SIPIs
    ///   * Requests an interrupt window, if PIC or APIC still has pending interrupts for this vcpu
    fn inject_interrupts(&self, vcpu: &dyn VcpuArch) -> Result<()> {
        let vcpu_id = vcpu.id();
        let mut vcpu_ready = vcpu.ready_for_interrupt();

        let mut pic_needs_window = false;
        // Only vcpu 0 is wired to the PIC.
        if vcpu_id == 0 {
            let mut pic = self.pic.lock();
            if vcpu_ready {
                if let Some(vector) = pic.get_external_interrupt() {
                    vcpu.interrupt(vector)?;
                    self.apics[vcpu_id].lock().set_mp_state(&MPState::Runnable);
                    // Already injected a PIC interrupt, so APIC fixed interrupt can't be injected.
                    vcpu_ready = false;
                }
            }
            pic_needs_window = pic.interrupt_requested();
        }

        let irqs = self.apics[vcpu_id].lock().get_pending_irqs(vcpu_ready);
        if let Some(vector) = irqs.fixed {
            let do_interrupt = {
                let mut apic = self.apics[vcpu_id].lock();
                match apic.get_mp_state() {
                    MPState::Runnable | MPState::Halted => {
                        // APIC interrupts should only be injectable when the MPState is
                        // Halted or Runnable.
                        apic.set_mp_state(&MPState::Runnable);
                        true
                    }
                    s => {
                        // This shouldn't happen, but log a helpful error if it does.
                        error!("Interrupt cannot be injected while in state: {:?}", s);
                        false
                    }
                }
            };

            if do_interrupt {
                vcpu.interrupt(vector)?;
            }
        }
        for _ in 0..irqs.nmis {
            let prev_state = self.apics[vcpu_id].lock().get_mp_state();
            vcpu.inject_nmi()?;
            self.apics[vcpu_id].lock().set_mp_state(&MPState::Runnable);
            info!(
                "Delivered NMI to cpu {}, mp_state was {:?}, now is {:?}",
                vcpu_id,
                prev_state,
                MPState::Runnable
            );
        }
        if irqs.init {
            {
                let mut apic = self.apics[vcpu_id].lock();
                apic.load_reset_state();
                apic.set_mp_state(&MPState::InitReceived);
            }
            info!("Delivered INIT IPI to cpu {}", vcpu_id);
        }
        if let Some(vector) = irqs.startup {
            // If our state is not MPState::InitReceived then this is probably
            // the second SIPI in the INIT-SIPI-SIPI sequence; ignore.
            if self.apics[vcpu_id].lock().get_mp_state() == MPState::InitReceived {
                self.deliver_startup(vcpu, vector)?;
                self.apics[vcpu_id].lock().set_mp_state(&MPState::Runnable);
                info!("Delivered SIPI to cpu {}", vcpu_id);
            }
        }

        let needs_window = pic_needs_window || irqs.needs_window;
        vcpu.set_interrupt_window_requested(needs_window);

        Ok(())
    }

    /// Notifies the irq chip that the specified VCPU has executed a halt instruction.
    /// For `UserspaceIrqChip`, it sets the APIC's mp_state to `MPState::Halted`.
    fn halted(&self, vcpu_id: usize) {
        self.apics[vcpu_id].lock().set_mp_state(&MPState::Halted)
    }

    /// Blocks until `vcpu` is in a runnable state or until interrupted by
    /// `IrqChip::kick_halted_vcpus`.  Returns `VcpuRunState::Runnable if vcpu is runnable, or
    /// `VcpuRunState::Interrupted` if the wait was interrupted.
    /// For `UserspaceIrqChip`, if the APIC isn't `MPState::Runnable`, sleep until there are new
    /// interrupts pending on the APIC, inject the interrupts, and go back to sleep if still not
    /// runnable.
    fn wait_until_runnable(&self, vcpu: &dyn VcpuArch) -> Result<VcpuRunState> {
        let vcpu_id = vcpu.id();
        let waiter = &self.waiters[vcpu_id];
        let mut interrupted_lock = waiter.mtx.lock();
        loop {
            // The interrupted flag is one-shot: consume it and report Interrupted.
            if *interrupted_lock {
                *interrupted_lock = false;
                info!("wait_until_runnable interrupted on cpu {}", vcpu_id);
                return Ok(VcpuRunState::Interrupted);
            }
            if self.is_runnable(vcpu_id) {
                return Ok(VcpuRunState::Runnable);
            }

            // Injecting pending interrupts may transition the APIC to Runnable (e.g. SIPI).
            self.inject_interrupts(vcpu)?;
            if self.is_runnable(vcpu_id) {
                return Ok(VcpuRunState::Runnable);
            }
            interrupted_lock = waiter.cvar.wait(interrupted_lock);
        }
    }

    /// Makes unrunnable VCPUs return immediately from `wait_until_runnable`.
    /// For UserspaceIrqChip, every vcpu gets kicked so its current or next call to
    /// `wait_until_runnable` will immediately return false.  After that one kick, subsequent
    /// `wait_until_runnable` calls go back to waiting for runnability normally.
    fn kick_halted_vcpus(&self) {
        for waiter in self.waiters.iter() {
            waiter.set_and_notify(/* interrupted= */ true);
        }
    }

    fn get_mp_state(&self, vcpu_id: usize) -> Result<MPState> {
        Ok(self.apics[vcpu_id].lock().get_mp_state())
    }

    fn set_mp_state(&self, vcpu_id: usize, state: &MPState) -> Result<()> {
        self.apics[vcpu_id].lock().set_mp_state(state);
        Ok(())
    }

    // TODO(srichman): factor out UserspaceIrqChip and KvmSplitIrqChip::finalize_devices
    /// Registers the emulated devices (PIT, PIC, ioapic, APIC MMIO) on the buses, wires up
    /// resample events, reserves low irq numbers, and starts the timer worker threads.
    fn finalize_devices(
        self: Arc<Self>,
        resources: &mut SystemAllocator,
        io_bus: &Bus,
        mmio_bus: &Bus,
    ) -> Result<()> {
        // Insert pit into io_bus
        io_bus.insert(self.pit.clone(), 0x040, 0x8).unwrap();
        io_bus.insert(self.pit.clone(), 0x061, 0x1).unwrap();

        // Insert pic into io_bus
        io_bus.insert(self.pic.clone(), 0x20, 0x2).unwrap();
        io_bus.insert(self.pic.clone(), 0xa0, 0x2).unwrap();
        io_bus.insert(self.pic.clone(), 0x4d0, 0x2).unwrap();

        // Insert ioapic into mmio_bus
        mmio_bus
            .insert(
                self.ioapic.clone(),
                IOAPIC_BASE_ADDRESS,
                IOAPIC_MEM_LENGTH_BYTES,
            )
            .unwrap();

        // Insert self into mmio_bus for handling APIC mmio
        mmio_bus
            .insert_sync(self.clone(), APIC_BASE_ADDRESS, APIC_MEM_LENGTH_BYTES)
            .unwrap();

        // At this point, all of our devices have been created and they have registered their
        // irq events, so we can clone our resample events
        let mut ioapic_resample_events: Vec<Vec<Event>> =
            (0..self.ioapic_pins).map(|_| Vec::new()).collect();
        let mut pic_resample_events: Vec<Vec<Event>> =
            (0..self.ioapic_pins).map(|_| Vec::new()).collect();

        for evt in self.irq_events.lock().iter().flatten() {
            if (evt.gsi as usize) >= self.ioapic_pins {
                continue;
            }
            if let Some(resample_evt) = &evt.resample_event {
                ioapic_resample_events[evt.gsi as usize].push(resample_evt.try_clone()?);
                pic_resample_events[evt.gsi as usize].push(resample_evt.try_clone()?);
            }
        }

        // Register resample events with the ioapic
        self.ioapic
            .lock()
            .register_resample_events(ioapic_resample_events);
        // Register resample events with the pic
        self.pic
            .lock()
            .register_resample_events(pic_resample_events);

        // Make sure all future irq numbers are >= self.ioapic_pins
        let mut irq_num = resources.allocate_irq().unwrap();
        while irq_num < self.ioapic_pins as u32 {
            irq_num = resources.allocate_irq().unwrap();
        }

        // Spawn timer threads here instead of in new(), in case crosvm is in sandbox mode.
        self.activated.store(true, Ordering::Relaxed);
        let _ = self.wake_internal();

        Ok(())
    }

    /// The UserspaceIrqChip's ioapic may be locked because a vcpu thread is currently writing to
    /// the ioapic, and the ioapic may be blocking on adding MSI routes, which requires blocking
    /// tube communication back to the main thread.  Thus, we do not want the main thread to
    /// block on a locked ioapic, so any irqs that could not be serviced because the ioapic could
    /// not be immediately locked are added to the delayed_ioapic_irq_events Vec. This function
    /// processes each delayed event in the vec each time it's called. If the ioapic is still
    /// locked, we keep the queued irqs for the next time this function is called.
    fn process_delayed_irq_events(&self) -> Result<()> {
        // Lock order: irq_events before delayed_ioapic_irq_events (see field docs).
        let irq_events = self.irq_events.lock();
        let mut delayed_events = self.delayed_ioapic_irq_events.lock();
        // retain() keeps (returns true for) events that still can't be serviced.
        delayed_events.events.retain(|&event_index| {
            if let Some(evt) = &irq_events[event_index] {
                if let Ok(mut ioapic) = self.ioapic.try_lock() {
                    if evt.resample_event.is_some() {
                        ioapic.service_irq(evt.gsi as usize, true);
                    } else {
                        ioapic.service_irq(evt.gsi as usize, true);
                        ioapic.service_irq(evt.gsi as usize, false);
                    }

                    false
                } else {
                    true
                }
            } else {
                // The irq event was unregistered; drop the delayed entry.
                true
            }
        });

        // Once the queue is drained, clear the trigger so the main loop stops polling us.
        if delayed_events.events.is_empty() {
            delayed_events.trigger.wait()?;
        }
        Ok(())
    }

    /// Returns a clone of the event that signals when delayed ioapic irqs are queued.
    fn irq_delayed_event_token(&self) -> Result<Option<Event>> {
        Ok(Some(
            self.delayed_ioapic_irq_events.lock().trigger.try_clone()?,
        ))
    }

    /// Reports which optional irqchip features this implementation supports.
    fn check_capability(&self, c: IrqChipCap) -> bool {
        match c {
            IrqChipCap::TscDeadlineTimer => false,
            IrqChipCap::X2Apic => false,
            IrqChipCap::MpStateGetSet => true,
        }
    }
}
807
808impl BusDevice for UserspaceIrqChip {
809    fn debug_label(&self) -> String {
810        "UserspaceIrqChip APIC".to_string()
811    }
812    fn device_id(&self) -> DeviceId {
813        PlatformDeviceId::UserspaceIrqChip.into()
814    }
815}
816
817impl Suspendable for UserspaceIrqChip {
818    fn sleep(&mut self) -> anyhow::Result<()> {
819        // TODO: This is never called because `UserspaceIrqChip` is a `BusDeviceSync`. We should be
820        // implementing `sleep_sync` and friends instead.
821        for thread in self.workers.lock().split_off(0).into_iter() {
822            thread.stop();
823        }
824        Ok(())
825    }
826
827    fn wake(&mut self) -> anyhow::Result<()> {
828        self.wake_internal()
829    }
830}
831
impl BusDeviceSync for UserspaceIrqChip {
    /// Reads from the APIC at index `info.id` at the given offset.
    fn read(&self, info: BusAccessInfo, data: &mut [u8]) {
        self.apics[info.id].lock().read(info.offset, data)
    }
    /// Writes to the APIC at index `info.id`; a write may produce an APIC bus
    /// message that must then be delivered.
    fn write(&self, info: BusAccessInfo, data: &[u8]) {
        // Bind the result first so the APIC lock guard is dropped before
        // handle_msg runs; presumably handle_msg may need to lock other (or the
        // same) APICs to deliver the message — confirm before restructuring
        // this into a single `if let` over the write call.
        let msg = self.apics[info.id].lock().write(info.offset, data);
        if let Some(m) = msg {
            self.handle_msg(m);
        }
    }
}
843
844impl IrqChipX86_64 for UserspaceIrqChip {
845    fn get_pic_state(&self, select: PicSelect) -> Result<PicState> {
846        Ok(self.pic.lock().get_pic_state(select))
847    }
848
849    fn set_pic_state(&self, select: PicSelect, state: &PicState) -> Result<()> {
850        self.pic.lock().set_pic_state(select, state);
851        Ok(())
852    }
853
854    fn get_ioapic_state(&self) -> Result<IoapicState> {
855        Ok(self.ioapic.lock().get_ioapic_state())
856    }
857
858    fn set_ioapic_state(&self, state: &IoapicState) -> Result<()> {
859        self.ioapic.lock().set_ioapic_state(state);
860        Ok(())
861    }
862
863    fn get_lapic_state(&self, vcpu_id: usize) -> Result<LapicState> {
864        Ok(self.apics[vcpu_id].lock().get_state())
865    }
866
867    fn set_lapic_state(&self, vcpu_id: usize, state: &LapicState) -> Result<()> {
868        self.apics[vcpu_id].lock().set_state(state);
869        Ok(())
870    }
871
872    /// Get the lapic frequency in Hz
873    fn lapic_frequency(&self) -> u32 {
874        Apic::frequency()
875    }
876
877    fn get_pit(&self) -> Result<PitState> {
878        Ok(self.pit.lock().get_pit_state())
879    }
880
881    fn set_pit(&self, state: &PitState) -> Result<()> {
882        self.pit.lock().set_pit_state(state);
883        Ok(())
884    }
885
886    /// Returns true if the PIT uses port 0x61 for the PC speaker, false if 0x61 is unused.
887    /// devices::Pit uses 0x61.
888    fn pit_uses_speaker_port(&self) -> bool {
889        true
890    }
891
892    fn snapshot_chip_specific(&self) -> anyhow::Result<AnySnapshot> {
893        Err(anyhow::anyhow!("Not supported yet in userspace"))
894    }
895    fn restore_chip_specific(&self, _data: AnySnapshot) -> anyhow::Result<()> {
896        Err(anyhow::anyhow!("Not supported yet in userspace"))
897    }
898}
899
/// Condition variable used by `UserspaceIrqChip::wait_until_runnable`.
#[derive(Default)]
struct Waiter {
    // mtx stores an "interrupted" bool that's true if `kick_halted_vcpus` has been called.
    mtx: Mutex<bool>,
    // Notified (under `mtx`) whenever the interrupted flag or a vcpu's runnable
    // state may have changed; waiters recheck their condition on wakeup.
    cvar: Condvar,
}
907
908impl Waiter {
909    /// Wakes up `wait_until_runnable` to recheck the interrupted flag and vcpu runnable state.
910    pub fn notify(&self) {
911        let _lock = self.mtx.lock();
912        self.cvar.notify_all();
913    }
914
915    /// Sets the interrupted flag, and wakes up `wait_until_runnable` to recheck the interrupted
916    /// flag and vcpu runnable state.  If `interrupted` is true, then `wait_until_runnable` should
917    /// stop waiting for a runnable vcpu and return immediately.
918    pub fn set_and_notify(&self, interrupted: bool) {
919        let mut interrupted_lock = self.mtx.lock();
920        *interrupted_lock = interrupted;
921        self.cvar.notify_all();
922    }
923}
924
/// Worker thread for polling timer events and sending them to an APIC.
struct TimerWorker {
    // Index of the vcpu this worker serves; used to look up its entry in `vcpus`.
    id: usize,
    // The APIC that receives timer-expiration notifications.
    apic: Arc<Mutex<Apic>>,
    // Shared vcpu list; entry `id` (when present) gets an interrupt window request
    // after the timer fires.
    vcpus: Arc<Mutex<Vec<Option<Arc<dyn VcpuX86_64>>>>>,
    // Descriptor polled for timer expiration.
    descriptor: Descriptor,
    // Notified after a timer expiration so a halted vcpu thread rechecks its state.
    waiter: Arc<Waiter>,
}
933
934impl TimerWorker {
935    fn run(&mut self, kill_evt: Event) -> TimerWorkerResult<()> {
936        #[derive(EventToken)]
937        enum Token {
938            // The timer expired.
939            TimerExpire,
940            // The parent thread requested an exit.
941            Kill,
942        }
943
944        let wait_ctx: WaitContext<Token> = WaitContext::build_with(&[
945            (&self.descriptor, Token::TimerExpire),
946            (&kill_evt, Token::Kill),
947        ])
948        .map_err(TimerWorkerError::CreateWaitContext)?;
949
950        loop {
951            let events = wait_ctx.wait().map_err(TimerWorkerError::WaitError)?;
952            for event in events.iter().filter(|e| e.is_readable) {
953                match event.token {
954                    Token::TimerExpire => {
955                        self.apic.lock().handle_timer_expiration();
956                        if let Some(Some(vcpu)) = self.vcpus.lock().get(self.id) {
957                            vcpu.set_interrupt_window_requested(true);
958                        }
959                        self.waiter.notify();
960                    }
961                    Token::Kill => return Ok(()),
962                }
963            }
964        }
965    }
966}
967
/// Errors that `TimerWorker::run` can return.
#[derive(Debug)]
enum TimerWorkerError {
    /// Creating WaitContext failed.
    CreateWaitContext(Error),
    /// Error while waiting for events.
    WaitError(Error),
}
975
976impl Display for TimerWorkerError {
977    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
978        use self::TimerWorkerError::*;
979
980        match self {
981            CreateWaitContext(e) => write!(f, "failed to create event context: {e}"),
982            WaitError(e) => write!(f, "failed to wait for events: {e}"),
983        }
984    }
985}
986
987impl std::error::Error for TimerWorkerError {}
988
989type TimerWorkerResult<T> = std::result::Result<T, TimerWorkerError>;