devices/
virtcpufreq_v2.rs

1// Copyright 2024 The ChromiumOS Authors
2// Use of this source code is governed by a BSD-style license that can be
3// found in the LICENSE file.
4
5use std::fs::File;
6use std::path::PathBuf;
7use std::sync::atomic::AtomicU32;
8use std::sync::atomic::Ordering;
9use std::sync::Arc;
10use std::time::Duration;
11
12use anyhow::Context;
13use base::sched_attr;
14use base::sched_setattr;
15use base::set_cpu_affinity;
16use base::warn;
17use base::Error;
18use base::Event;
19use base::EventToken;
20use base::Timer;
21use base::TimerTrait;
22use base::Tube;
23use base::WaitContext;
24use base::WorkerThread;
25use sync::Mutex;
26
27use crate::pci::CrosvmDeviceId;
28use crate::BusAccessInfo;
29use crate::BusDevice;
30use crate::DeviceId;
31use crate::Suspendable;
32
/// Utilization scale (in percent) applied when the host governor is anything but `schedutil`.
/// Also serves as the divisor when a governor-specific factor is applied.
const CPUFREQ_GOV_SCALE_FACTOR_DEFAULT: u32 = 100;
/// Utilization scale (in percent) applied when the host governor is `schedutil`.
const CPUFREQ_GOV_SCALE_FACTOR_SCHEDUTIL: u32 = 80;

// `sched_flags` bits passed to `sched_setattr`.
const SCHED_FLAG_RESET_ON_FORK: u64 = 1 << 0;
const SCHED_FLAG_KEEP_POLICY: u64 = 1 << 3;
const SCHED_FLAG_KEEP_PARAMS: u64 = 1 << 4;
const SCHED_FLAG_UTIL_CLAMP_MIN: u64 = 1 << 5;
const SCHED_FLAG_UTIL_CLAMP_MAX: u64 = 1 << 6;

// Register offsets of the device, relative to the start of its MMIO window.
const VCPUFREQ_CUR_PERF: u32 = 0x00;
const VCPUFREQ_SET_PERF: u32 = 0x04;
const VCPUFREQ_FREQTBL_LEN: u32 = 0x08;
const VCPUFREQ_FREQTBL_SEL: u32 = 0x0c;
const VCPUFREQ_FREQTBL_RD: u32 = 0x10;
const VCPUFREQ_PERF_DOMAIN: u32 = 0x14;

/// Both "keep" flags combined, so only the utilization clamps are modified.
const SCHED_FLAG_KEEP_ALL: u64 = SCHED_FLAG_KEEP_PARAMS | SCHED_FLAG_KEEP_POLICY;
/// Full-scale value of a scheduler utilization clamp.
const SCHED_CAPACITY_SCALE: u32 = 1 << 10;

// Timer values in microseconds.
/// Throttle duration sent with every `Throttle` request; also the base of the timer period.
const MIN_TIMER_US: u32 = 75;
/// Fixed overhead added to `MIN_TIMER_US` when the repeating timer period is computed.
const TIMER_OVERHEAD_US: u32 = 15;
55
56/// Upstream linux compatible version of the virtual cpufreq interface
57pub struct VirtCpufreqV2 {
58    vcpu_freq_table: Vec<u32>,
59    pcpu_fmax: u32,
60    pcpu_capacity: u32,
61    pcpu: u32,
62    util_factor: u32,
63    freqtbl_sel: u32,
64    vcpu_domain: u32,
65    domain_uclamp_min: Option<File>,
66    domain_uclamp_max: Option<File>,
67    vcpu_fmax: u32,
68    vcpu_capacity: u32,
69    vcpu_relative_capacity: u32,
70    worker: Option<WorkerThread<()>>,
71    timer: Arc<Mutex<Timer>>,
72    vm_ctrl: Arc<Mutex<Tube>>,
73    pcpu_min_cap: u32,
74    /// The largest(or the last) pCPU index to be used by all the vCPUs. This index is used to
75    /// figure out the proper placement of the throttle workers which are placed on pCPUs right
76    /// after the last pCPU being used the vCPUs. Throttle workers require their own exclusive
77    /// pCPU allocation and this ensure that the workers are placed contiguously and makes it
78    /// easier for user to manage pCPU allocations when running multiple instances on a large
79    /// server.
80    largest_pcpu_idx: usize,
81    //TODO: Put the shared_domain_members in a struct
82    shared_domain_vcpus: Vec<usize>,
83    shared_domain_perf: Arc<AtomicU32>,
84}
85
86fn get_cpu_info(cpu_id: u32, property: &str) -> Result<u32, Error> {
87    let path = format!("/sys/devices/system/cpu/cpu{cpu_id}/{property}");
88    std::fs::read_to_string(path)?
89        .trim()
90        .parse()
91        .map_err(|_| Error::new(libc::EINVAL))
92}
93
94fn get_cpu_info_str(cpu_id: u32, property: &str) -> Result<String, Error> {
95    let path = format!("/sys/devices/system/cpu/cpu{cpu_id}/{property}");
96    std::fs::read_to_string(path).map_err(|_| Error::new(libc::EINVAL))
97}
98
99fn get_cpu_capacity(cpu_id: u32) -> Result<u32, Error> {
100    get_cpu_info(cpu_id, "cpu_capacity")
101}
102
103fn get_cpu_maxfreq_khz(cpu_id: u32) -> Result<u32, Error> {
104    get_cpu_info(cpu_id, "cpufreq/cpuinfo_max_freq")
105}
106
107fn get_cpu_minfreq_khz(cpu_id: u32) -> Result<u32, Error> {
108    get_cpu_info(cpu_id, "cpufreq/cpuinfo_min_freq")
109}
110
111fn get_cpu_curfreq_khz(cpu_id: u32) -> Result<u32, Error> {
112    get_cpu_info(cpu_id, "cpufreq/scaling_cur_freq")
113}
114
115fn get_cpu_util_factor(cpu_id: u32) -> Result<u32, Error> {
116    let gov = get_cpu_info_str(cpu_id, "cpufreq/scaling_governor")?;
117    match gov.trim() {
118        "schedutil" => Ok(CPUFREQ_GOV_SCALE_FACTOR_SCHEDUTIL),
119        _ => Ok(CPUFREQ_GOV_SCALE_FACTOR_DEFAULT),
120    }
121}
122
123impl VirtCpufreqV2 {
124    pub fn new(
125        pcpu: u32,
126        vcpu_freq_table: Vec<u32>,
127        vcpu_domain_path: Option<PathBuf>,
128        vcpu_domain: u32,
129        vcpu_capacity: u32,
130        largest_pcpu_idx: usize,
131        vm_ctrl: Arc<Mutex<Tube>>,
132        shared_domain_vcpus: Vec<usize>,
133        shared_domain_perf: Arc<AtomicU32>,
134    ) -> Self {
135        let pcpu_capacity = get_cpu_capacity(pcpu).expect("Error reading capacity");
136        let pcpu_fmax = get_cpu_maxfreq_khz(pcpu).expect("Error reading max freq");
137        let util_factor = get_cpu_util_factor(pcpu).expect("Error getting util factor");
138        let freqtbl_sel = 0;
139        let mut domain_uclamp_min = None;
140        let mut domain_uclamp_max = None;
141        // The vcpu_capacity passed in is normalized for frequency, reverse the normalization to
142        // get the performance per clock ratio between the vCPU and the pCPU its running on. This
143        // "relative capacity" is an approximation of the delta in IPC (Instructions per Cycle)
144        // between the pCPU vs vCPU running a usecase containing a mix of instruction types.
145        let vcpu_fmax = vcpu_freq_table.clone().into_iter().max().unwrap();
146        let vcpu_relative_capacity =
147            u32::try_from(u64::from(vcpu_capacity) * u64::from(pcpu_fmax) / u64::from(vcpu_fmax))
148                .unwrap();
149        let pcpu_min_cap =
150            get_cpu_minfreq_khz(pcpu).expect("Error reading min freq") * pcpu_capacity / pcpu_fmax;
151
152        if let Some(cgroup_path) = &vcpu_domain_path {
153            domain_uclamp_min = Some(
154                File::create(cgroup_path.join("cpu.uclamp.min")).unwrap_or_else(|err| {
155                    panic!(
156                        "Err: {}, Unable to open: {}",
157                        err,
158                        cgroup_path.join("cpu.uclamp.min").display()
159                    )
160                }),
161            );
162            domain_uclamp_max = Some(
163                File::create(cgroup_path.join("cpu.uclamp.max")).unwrap_or_else(|err| {
164                    panic!(
165                        "Err: {}, Unable to open: {}",
166                        err,
167                        cgroup_path.join("cpu.uclamp.max").display()
168                    )
169                }),
170            );
171        }
172
173        VirtCpufreqV2 {
174            vcpu_freq_table,
175            pcpu_fmax,
176            pcpu_capacity,
177            pcpu,
178            util_factor,
179            freqtbl_sel,
180            vcpu_domain,
181            domain_uclamp_min,
182            domain_uclamp_max,
183            vcpu_fmax,
184            vcpu_capacity,
185            vcpu_relative_capacity,
186            worker: None,
187            timer: Arc::new(Mutex::new(Timer::new().expect("failed to create Timer"))),
188            vm_ctrl,
189            pcpu_min_cap,
190            largest_pcpu_idx,
191            shared_domain_vcpus,
192            shared_domain_perf,
193        }
194    }
195}
196
197impl BusDevice for VirtCpufreqV2 {
198    fn device_id(&self) -> DeviceId {
199        CrosvmDeviceId::VirtCpufreq.into()
200    }
201
202    fn debug_label(&self) -> String {
203        "VirtCpufreq Device".to_owned()
204    }
205
206    fn read(&mut self, info: BusAccessInfo, data: &mut [u8]) {
207        if data.len() != std::mem::size_of::<u32>() {
208            warn!(
209                "{}: unsupported read length {}, only support 4bytes read",
210                self.debug_label(),
211                data.len()
212            );
213            return;
214        }
215
216        let val = match info.offset as u32 {
217            VCPUFREQ_CUR_PERF => {
218                let shared_util = self.shared_domain_perf.load(Ordering::SeqCst);
219                if shared_util != 0 && shared_util < self.pcpu_min_cap {
220                    shared_util * self.vcpu_fmax / self.vcpu_capacity
221                } else {
222                    match get_cpu_curfreq_khz(self.pcpu) {
223                        Ok(freq) => u32::try_from(
224                            u64::from(freq) * u64::from(self.pcpu_capacity)
225                                / u64::from(self.vcpu_relative_capacity),
226                        )
227                        .unwrap(),
228                        Err(_) => 0,
229                    }
230                }
231            }
232            VCPUFREQ_FREQTBL_LEN => self.vcpu_freq_table.len() as u32,
233            VCPUFREQ_PERF_DOMAIN => self.vcpu_domain,
234            VCPUFREQ_FREQTBL_RD => *self
235                .vcpu_freq_table
236                .get(self.freqtbl_sel as usize)
237                .unwrap_or(&0),
238            _ => {
239                warn!("{}: unsupported read address {}", self.debug_label(), info);
240                return;
241            }
242        };
243
244        let val_arr = val.to_ne_bytes();
245        data.copy_from_slice(&val_arr);
246    }
247
248    fn write(&mut self, info: BusAccessInfo, data: &[u8]) {
249        let val: u32 = match data.try_into().map(u32::from_ne_bytes) {
250            Ok(v) => v,
251            Err(e) => {
252                warn!(
253                    "{}: unsupported write length {:#}, only support 4bytes write",
254                    self.debug_label(),
255                    e
256                );
257                return;
258            }
259        };
260
261        match info.offset as u32 {
262            VCPUFREQ_SET_PERF => {
263                // Util margin depends on the cpufreq governor on the host
264                let util_raw = match u32::try_from(
265                    u64::from(self.vcpu_capacity) * u64::from(val) / u64::from(self.vcpu_fmax),
266                ) {
267                    Ok(util) => util,
268                    Err(e) => {
269                        warn!("Potential overflow {:#}", e);
270                        SCHED_CAPACITY_SCALE
271                    }
272                };
273
274                let util = util_raw * self.util_factor / CPUFREQ_GOV_SCALE_FACTOR_DEFAULT;
275
276                if let (Some(domain_uclamp_min), Some(domain_uclamp_max)) =
277                    (&mut self.domain_uclamp_min, &mut self.domain_uclamp_max)
278                {
279                    use std::io::Write;
280                    let val = util as f32 * 100.0 / SCHED_CAPACITY_SCALE as f32;
281                    let val_formatted = format!("{val:4}").into_bytes();
282
283                    if self.vcpu_fmax != self.pcpu_fmax {
284                        if let Err(e) = domain_uclamp_max.write(&val_formatted) {
285                            warn!("Error setting uclamp_max: {:#}", e);
286                        }
287                    }
288                    if let Err(e) = domain_uclamp_min.write(&val_formatted) {
289                        warn!("Error setting uclamp_min: {:#}", e);
290                    }
291                } else {
292                    let mut sched_attr = sched_attr {
293                        sched_flags: SCHED_FLAG_KEEP_ALL
294                            | SCHED_FLAG_UTIL_CLAMP_MIN
295                            | SCHED_FLAG_UTIL_CLAMP_MAX
296                            | SCHED_FLAG_RESET_ON_FORK,
297                        sched_util_min: util,
298                        ..Default::default()
299                    };
300
301                    if self.vcpu_fmax != self.pcpu_fmax {
302                        sched_attr.sched_util_max = util;
303                    } else {
304                        sched_attr.sched_util_max = SCHED_CAPACITY_SCALE;
305                    }
306
307                    if let Err(e) = sched_setattr(0, &mut sched_attr, 0) {
308                        panic!("{}: Error setting util value: {:#}", self.debug_label(), e);
309                    }
310                }
311
312                // Return early if vcpu_fmax matches pcpu_fmax as that denotes no vCPU throttling
313                // is required.
314                if self.vcpu_fmax == self.pcpu_fmax {
315                    return;
316                }
317
318                self.shared_domain_perf.store(util_raw, Ordering::SeqCst);
319                let timer = self.timer.clone();
320                if self.worker.is_none() {
321                    let vcpu_id = info.id;
322                    let vm_ctrl = self.vm_ctrl.clone();
323                    let worker_cpu_affinity = self.largest_pcpu_idx + self.vcpu_domain as usize + 1;
324                    let shared_domain_vcpus = self.shared_domain_vcpus.clone();
325
326                    self.worker = Some(WorkerThread::start(
327                        format!("vcpu_throttle{vcpu_id}"),
328                        move |kill_evt| {
329                            vcpufreq_worker_thread(
330                                shared_domain_vcpus,
331                                kill_evt,
332                                timer,
333                                vm_ctrl,
334                                worker_cpu_affinity,
335                            )
336                            .expect("error running vpucfreq_worker")
337                        },
338                    ));
339                } else if util_raw < self.pcpu_min_cap {
340                    // The period is porportional to the performance requested by the vCPU, we
341                    // reduce the timeout period to increase the amount of throttling applied to
342                    // the vCPU as the performance decreases. Ex. If vCPU requests half of the
343                    // performance relatively to its pCPU@FMin, the vCPU will spend 50% of its
344                    // cycles being throttled to increase time for the same workload that otherwise
345                    // would've taken 1/2 of the time if ran at pCPU@FMin. We could've
346                    // alternatively adjusted the workload and used some fixed period (such as
347                    // 250us), but there's a floor for the minimum delay we add (cost of handling
348                    // the userspace exit) and limits the range of performance we can emulate.
349                    let timeout_period = (MIN_TIMER_US + TIMER_OVERHEAD_US) as f32
350                        / (1.0 - (util_raw as f32 / self.pcpu_min_cap as f32));
351                    let _ = timer
352                        .lock()
353                        .reset_repeating(Duration::from_micros(timeout_period as u64));
354                } else {
355                    let _ = timer.lock().clear();
356                }
357            }
358            VCPUFREQ_FREQTBL_SEL => self.freqtbl_sel = val,
359            _ => {
360                warn!("{}: unsupported read address {}", self.debug_label(), info);
361            }
362        }
363    }
364}
365
366pub fn vcpufreq_worker_thread(
367    shared_domain_vcpus: Vec<usize>,
368    kill_evt: Event,
369    timer: Arc<Mutex<Timer>>,
370    vm_ctrl: Arc<Mutex<Tube>>,
371    cpu_affinity: usize,
372) -> anyhow::Result<()> {
373    #[derive(EventToken)]
374    enum Token {
375        // The timer expired.
376        TimerExpire,
377        // The parent thread requested an exit.
378        Kill,
379    }
380
381    let wait_ctx = WaitContext::build_with(&[
382        (&*timer.lock(), Token::TimerExpire),
383        (&kill_evt, Token::Kill),
384    ])
385    .context("Failed to create wait_ctx")?;
386
387    // The vcpufreq thread has strict scheduling requirements, let's affine it away from the vCPU
388    // threads and clamp its util to high value.
389    let cpu_set: Vec<usize> = vec![cpu_affinity];
390    set_cpu_affinity(cpu_set)?;
391
392    let mut sched_attr = sched_attr {
393        sched_flags: SCHED_FLAG_KEEP_ALL
394            | SCHED_FLAG_UTIL_CLAMP_MIN
395            | SCHED_FLAG_UTIL_CLAMP_MAX
396            | SCHED_FLAG_RESET_ON_FORK,
397        sched_util_min: SCHED_CAPACITY_SCALE,
398        sched_util_max: SCHED_CAPACITY_SCALE,
399        ..Default::default()
400    };
401    if let Err(e) = sched_setattr(0, &mut sched_attr, 0) {
402        warn!("Error setting util value: {}", e);
403    }
404
405    loop {
406        let events = wait_ctx.wait().context("Failed to wait for events")?;
407        for event in events.iter().filter(|e| e.is_readable) {
408            match event.token {
409                Token::TimerExpire => {
410                    timer
411                        .lock()
412                        .mark_waited()
413                        .context("failed to reset timer")?;
414                    let vm_ctrl_unlocked = vm_ctrl.lock();
415                    for vcpu_id in &shared_domain_vcpus {
416                        let msg = vm_control::VmRequest::Throttle(*vcpu_id, MIN_TIMER_US);
417                        vm_ctrl_unlocked
418                            .send(&msg)
419                            .context("failed to stall vCPUs")?;
420                    }
421                }
422                Token::Kill => {
423                    return Ok(());
424                }
425            }
426        }
427    }
428}
429
430impl Suspendable for VirtCpufreqV2 {}