devices/virtio/vhost_user_backend/fs/sys/
linux.rs

1// Copyright 2022 The ChromiumOS Authors
2// Use of this source code is governed by a BSD-style license that can be
3// found in the LICENSE file.
4
5use std::os::fd::OwnedFd;
6use std::os::unix::process::ExitStatusExt;
7use std::path::Path;
8use std::path::PathBuf;
9
10use anyhow::bail;
11use anyhow::Context;
12use base::error;
13use base::linux::max_open_files;
14use base::sys::wait_for_pid;
15use base::AsRawDescriptor;
16use base::AsRawDescriptors;
17use base::IntoRawDescriptor;
18use base::RawDescriptor;
19use base::UnixSeqpacketListener;
20use cros_async::Executor;
21use jail::create_base_minijail;
22use jail::create_base_minijail_without_pivot_root;
23use minijail::Minijail;
24
25use crate::virtio::vhost_user_backend::fs::FsBackend;
26use crate::virtio::vhost_user_backend::fs::Options;
27use crate::virtio::vhost_user_backend::BackendConnection;
28
29fn default_uidmap() -> String {
30    // SAFETY: trivially safe
31    let euid = unsafe { libc::geteuid() };
32    format!("{euid} {euid} 1")
33}
34
35fn default_gidmap() -> String {
36    // SAFETY: trivially safe
37    let egid = unsafe { libc::getegid() };
38    format!("{egid} {egid} 1")
39}
40
41#[allow(clippy::unnecessary_cast)]
42fn jail_and_fork(
43    mut keep_rds: Vec<RawDescriptor>,
44    dir_path: PathBuf,
45    uid: u32,
46    gid: u32,
47    uid_map: Option<String>,
48    gid_map: Option<String>,
49    disable_sandbox: bool,
50    pivot_root: bool,
51) -> anyhow::Result<i32> {
52    let limit = max_open_files()
53        .context("failed to get max open files")?
54        .rlim_max;
55    // Create new minijail sandbox
56    let jail = if disable_sandbox {
57        if pivot_root {
58            create_base_minijail(dir_path.as_path(), limit)
59        } else {
60            create_base_minijail_without_pivot_root(dir_path.as_path(), limit)
61        }?
62    } else {
63        let mut j: Minijail = Minijail::new()?;
64
65        j.namespace_pids();
66        j.namespace_user();
67        j.namespace_user_disable_setgroups();
68        if uid != 0 {
69            j.change_uid(uid);
70        }
71        if gid != 0 {
72            j.change_gid(gid);
73        }
74        j.uidmap(&uid_map.unwrap_or_else(default_uidmap))?;
75        j.gidmap(&gid_map.unwrap_or_else(default_gidmap))?;
76        j.run_as_init();
77
78        j.namespace_vfs();
79        j.namespace_net();
80        j.no_new_privs();
81
82        // Only pivot_root if we are not re-using the current root directory.
83        if dir_path != Path::new("/") {
84            // It's safe to call `namespace_vfs` multiple times.
85            j.namespace_vfs();
86            j.enter_pivot_root(&dir_path)?;
87        }
88        j.set_remount_mode(libc::MS_SLAVE);
89
90        j.set_rlimit(libc::RLIMIT_NOFILE as i32, limit, limit)?;
91        // vvu locks around 512k memory. Just give 1M.
92        j.set_rlimit(libc::RLIMIT_MEMLOCK as i32, 1 << 20, 1 << 20)?;
93        #[cfg(not(feature = "seccomp_trace"))]
94        jail::set_embedded_bpf_program(&mut j, "fs_device_vhost_user")?;
95        j.use_seccomp_filter();
96        j
97    };
98
99    // Make sure there are no duplicates in keep_rds
100    keep_rds.sort_unstable();
101    keep_rds.dedup();
102
103    // fork on the jail here
104    // SAFETY: trivially safe
105    let pid = unsafe { jail.fork(Some(&keep_rds))? };
106
107    if pid > 0 {
108        // Current FS driver jail does not use seccomp and jail_and_fork() does not have other
109        // users, so we do nothing here for seccomp_trace
110        // SAFETY: trivially safe
111        unsafe { libc::prctl(libc::PR_SET_PDEATHSIG, libc::SIGTERM) };
112    }
113
114    if pid < 0 {
115        bail!("Fork error! {}", std::io::Error::last_os_error());
116    }
117
118    Ok(pid)
119}
120
121/// Starts a vhost-user fs device.
122/// Returns an error if the given `args` is invalid or the device fails to run.
123#[allow(unused_mut)]
124pub fn start_device(mut opts: Options) -> anyhow::Result<()> {
125    #[allow(unused_mut)]
126    let mut is_pivot_root_required = true;
127    #[cfg(feature = "fs_runtime_ugid_map")]
128    if let Some(ref mut cfg) = opts.cfg {
129        if !cfg.ugid_map.is_empty() && (!opts.disable_sandbox || !opts.skip_pivot_root) {
130            bail!("uid_gid_map can only be set with disable sandbox and skip_pivot_root option");
131        }
132
133        if cfg.unmap_guest_memory_on_fork && !opts.disable_sandbox {
134            bail!("unmap_guest_memory_on_fork requires --disable-sandbox");
135        }
136
137        if opts.skip_pivot_root {
138            is_pivot_root_required = false;
139        }
140    }
141    let ex = Executor::new().context("Failed to create executor")?;
142
143    let mut allowlist_listener_fd = None;
144    if let Some(ref path) = opts.allowlist_socket_path {
145        let listener =
146            UnixSeqpacketListener::bind(path).context("failed to bind allowlist socket")?;
147        let fd = OwnedFd::from(listener).into_raw_descriptor();
148        allowlist_listener_fd = Some(fd);
149    }
150
151    let mut fs_device = FsBackend::new(
152        &opts.tag,
153        opts.shared_dir
154            .to_str()
155            .expect("Failed to convert opts.shared_dir to str()"),
156        opts.skip_pivot_root,
157        opts.cfg,
158        allowlist_listener_fd,
159    )?;
160
161    let mut keep_rds = fs_device.keep_rds.clone();
162    keep_rds.append(&mut ex.as_raw_descriptors());
163
164    let conn =
165        BackendConnection::from_opts(opts.socket.as_deref(), opts.socket_path.as_deref(), opts.fd)?;
166    keep_rds.push(conn.as_raw_descriptor());
167    if let Some(fd) = allowlist_listener_fd {
168        keep_rds.push(fd);
169    }
170
171    base::syslog::push_descriptors(&mut keep_rds);
172    cros_tracing::push_descriptors!(&mut keep_rds);
173    metrics::push_descriptors(&mut keep_rds);
174    let pid = jail_and_fork(
175        keep_rds,
176        opts.shared_dir,
177        opts.uid,
178        opts.gid,
179        opts.uid_map,
180        opts.gid_map,
181        opts.disable_sandbox,
182        is_pivot_root_required,
183    )?;
184
185    match pid {
186        0 => {
187            // Child process runs the device and exits, not returns.
188            fs_device.start_allowlist_listener();
189            if let Err(e) = ex.run_until(conn.run_backend(fs_device, &ex)) {
190                error!("Error in vhost-user-fs device: {:#}", e);
191                std::process::exit(1);
192            }
193            std::process::exit(0);
194        }
195        pid if pid < 0 => {
196            unreachable!("fork error must have been handled in jail_and_fork()");
197        }
198        _ => {
199            // fs_device is not needed in the parent process.
200            drop(fs_device);
201
202            let (_child_pid, status) =
203                wait_for_pid(pid, 0).context("failed to wait for child process")?;
204            if let Some(signal) = status.signal() {
205                panic!("Child process {pid} was killed by signal {signal}");
206            }
207            if let Some(exit_code) = status.code() {
208                if exit_code != 0 {
209                    bail!("Child process {pid} exited with code {exit_code}");
210                }
211            }
212        }
213    };
214    Ok(())
215}