1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
// Copyright 2019 The ChromiumOS Authors
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

//! VM disk image file format I/O.

use std::cmp::min;
use std::fmt::Debug;
use std::fs::File;
use std::io;
use std::io::Seek;
use std::io::SeekFrom;
use std::path::PathBuf;
use std::sync::Arc;

use async_trait::async_trait;
use base::info;
use base::AsRawDescriptors;
use base::FileAllocate;
use base::FileReadWriteAtVolatile;
use base::FileSetLen;
use cros_async::BackingMemory;
use cros_async::Executor;
use cros_async::IoSource;
use cros_async::MemRegionIter;
use thiserror::Error as ThisError;

mod asynchronous;
#[allow(unused)]
pub(crate) use asynchronous::AsyncDiskFileWrapper;
#[cfg(feature = "qcow")]
mod qcow;
#[cfg(feature = "qcow")]
pub use qcow::QcowFile;
#[cfg(feature = "qcow")]
pub use qcow::QCOW_MAGIC;
mod sys;

#[cfg(feature = "composite-disk")]
mod composite;
#[cfg(feature = "composite-disk")]
use composite::CompositeDiskFile;
#[cfg(feature = "composite-disk")]
use composite::CDISK_MAGIC;
#[cfg(feature = "composite-disk")]
mod gpt;
#[cfg(feature = "composite-disk")]
pub use composite::create_composite_disk;
#[cfg(feature = "composite-disk")]
pub use composite::create_zero_filler;
#[cfg(feature = "composite-disk")]
pub use composite::Error as CompositeError;
#[cfg(feature = "composite-disk")]
pub use composite::ImagePartitionType;
#[cfg(feature = "composite-disk")]
pub use composite::PartitionInfo;
#[cfg(feature = "composite-disk")]
pub use gpt::Error as GptError;

#[cfg(feature = "android-sparse")]
mod android_sparse;
#[cfg(feature = "android-sparse")]
use android_sparse::AndroidSparse;
#[cfg(feature = "android-sparse")]
use android_sparse::SPARSE_HEADER_MAGIC;
use sys::read_from_disk;

#[cfg(feature = "zstd")]
mod zstd;
#[cfg(feature = "zstd")]
use zstd::ZstdDisk;
#[cfg(feature = "zstd")]
use zstd::ZSTD_FRAME_MAGIC;
#[cfg(feature = "zstd")]
use zstd::ZSTD_SKIPPABLE_MAGIC_HIGH;
#[cfg(feature = "zstd")]
use zstd::ZSTD_SKIPPABLE_MAGIC_LOW;

/// Nesting depth limit for disk formats that can open other disk files.
const MAX_NESTING_DEPTH: u32 = 10;

/// Errors that can occur while creating, opening, converting, or performing I/O on a disk image.
///
/// Variants prefixed with `Io` wrap synchronous `std::io` failures; their async counterparts
/// (e.g. `Fsync` vs `IoFsync`) wrap `cros_async` failures from the async disk path.
#[derive(ThisError, Debug)]
pub enum Error {
    #[error("failed to create block device: {0}")]
    BlockDeviceNew(base::Error),
    #[error("requested file conversion not supported")]
    ConversionNotSupported,
    #[cfg(feature = "android-sparse")]
    #[error("failure in android sparse disk: {0}")]
    CreateAndroidSparseDisk(android_sparse::Error),
    #[cfg(feature = "composite-disk")]
    #[error("failure in composite disk: {0}")]
    CreateCompositeDisk(composite::Error),
    #[cfg(feature = "zstd")]
    #[error("failure in zstd disk: {0}")]
    CreateZstdDisk(anyhow::Error),
    #[error("failure creating single file disk: {0}")]
    CreateSingleFileDisk(cros_async::AsyncError),
    #[error("failed to set O_DIRECT on disk image: {0}")]
    DirectFailed(base::Error),
    // Async (cros_async) sync-to-disk failures.
    #[error("failure with fdatasync: {0}")]
    Fdatasync(cros_async::AsyncError),
    #[error("failure with fsync: {0}")]
    Fsync(cros_async::AsyncError),
    #[error("failed to lock file: {0}")]
    LockFileFailure(base::Error),
    // Synchronous (std::io) sync-to-disk failures.
    #[error("failure with fdatasync: {0}")]
    IoFdatasync(io::Error),
    #[error("failure with flush: {0}")]
    IoFlush(io::Error),
    #[error("failure with fsync: {0}")]
    IoFsync(io::Error),
    #[error("failure to punch hole: {0}")]
    IoPunchHole(io::Error),
    #[error("checking host fs type: {0}")]
    HostFsType(base::Error),
    #[error("maximum disk nesting depth exceeded")]
    MaxNestingDepthExceeded,
    #[error("failed to open disk file \"{0}\": {1}")]
    OpenFile(String, base::Error),
    #[error("failure to punch hole: {0}")]
    PunchHole(cros_async::AsyncError),
    #[error("failure to punch hole for block device file: {0}")]
    PunchHoleBlockDeviceFile(base::Error),
    #[cfg(feature = "qcow")]
    #[error("failure in qcow: {0}")]
    QcowError(qcow::Error),
    #[error("failed to read data: {0}")]
    ReadingData(io::Error),
    #[error("failed to read header: {0}")]
    ReadingHeader(io::Error),
    #[error("failed to read to memory: {0}")]
    ReadToMem(cros_async::AsyncError),
    #[error("failed to seek file: {0}")]
    SeekingFile(io::Error),
    #[error("failed to set file size: {0}")]
    SettingFileSize(io::Error),
    #[error("unknown disk type")]
    UnknownType,
    #[error("failed to write from memory: {0}")]
    WriteFromMem(cros_async::AsyncError),
    #[error("failed to write from vec: {0}")]
    WriteFromVec(cros_async::AsyncError),
    #[error("failed to write zeroes: {0}")]
    WriteZeroes(io::Error),
    #[error("failed to write data: {0}")]
    WritingData(io::Error),
    #[error("failed to convert to async: {0}")]
    ToAsync(cros_async::AsyncError),
    #[cfg(windows)]
    #[error("failed to set disk file sparse: {0}")]
    SetSparseFailure(io::Error),
    #[error("failure with guest memory access: {0}")]
    GuestMemory(cros_async::mem::Error),
    #[error("unsupported operation")]
    UnsupportedOperation,
}

/// Convenience alias for results whose error type is this crate's [`Error`].
pub type Result<T> = std::result::Result<T, Error>;

/// A trait for getting the length of a disk image or raw block device.
pub trait DiskGetLen {
    /// Get the current length of the disk in bytes.
    fn get_len(&self) -> io::Result<u64>;
}

impl DiskGetLen for File {
    fn get_len(&self) -> io::Result<u64> {
        // Measure the length by seeking to the end, then restore the original
        // cursor so the call has no observable side effect on the file position.
        // (Seeking, rather than `metadata()`, also works for block device nodes.)
        let mut f = self;
        let saved_pos = f.stream_position()?;
        let len = f.seek(SeekFrom::End(0))?;
        f.seek(SeekFrom::Start(saved_pos))?;
        Ok(len)
    }
}

/// The prerequisites necessary to support a block device.
pub trait DiskFile:
    FileSetLen + DiskGetLen + FileReadWriteAtVolatile + ToAsyncDisk + Send + AsRawDescriptors + Debug
{
    /// Returns a new `DiskFile` instance backed by the same underlying disk image, so that I/O
    /// performed through any one instance is visible through all of them.
    ///
    /// The default implementation returns an [`io::ErrorKind::Unsupported`] error, for disk
    /// types that cannot share their underlying image between instances.
    fn try_clone(&self) -> io::Result<Box<dyn DiskFile>> {
        let kind = io::ErrorKind::Unsupported;
        Err(io::Error::new(kind, "unsupported operation"))
    }
}

/// A `DiskFile` that can be converted for asynchronous access.
pub trait ToAsyncDisk: AsRawDescriptors + DiskGetLen + Send {
    /// Convert a boxed self in to a box-wrapped implementation of AsyncDisk.
    /// Used to convert a standard disk image to an async disk image. This conversion and the
    /// inverse are needed so that the `Send` DiskImage can be given to the block thread where it is
    /// converted to a non-`Send` AsyncDisk. The AsyncDisk can then be converted back and returned
    /// to the main device thread if the block device is destroyed or reset.
    fn to_async_disk(self: Box<Self>, ex: &Executor) -> Result<Box<dyn AsyncDisk>>;
}

impl ToAsyncDisk for File {
    fn to_async_disk(self: Box<Self>, ex: &Executor) -> Result<Box<dyn AsyncDisk>> {
        // Wrap the plain `File` in a `SingleFileDisk` driven by the provided executor.
        let disk = SingleFileDisk::new(*self, ex)?;
        Ok(Box::new(disk))
    }
}

/// The variants of image files on the host that can be used as virtual disks.
#[derive(Debug, PartialEq, Eq)]
pub enum ImageType {
    /// Plain raw image; the fallback when no other format's header matches.
    Raw,
    /// QCOW2 image, detected by the qcow magic number.
    Qcow2,
    /// Composite disk assembled from multiple partition image files.
    CompositeDisk,
    /// Android sparse image format.
    AndroidSparse,
    /// Zstd-compressed image, detected by a zstd frame or skippable-frame magic.
    Zstd,
}

/// Detect the type of an image file by checking for a valid header of the supported formats.
///
/// The file's seek position is saved on entry and restored before returning, so callers observe
/// no change to the cursor. `overlapped_mode` is forwarded to the platform read helper (it only
/// has an effect on Windows).
pub fn detect_image_type(file: &File, overlapped_mode: bool) -> Result<ImageType> {
    let mut f = file;
    let disk_size = f.get_len().map_err(Error::SeekingFile)?;
    let orig_seek = f.stream_position().map_err(Error::SeekingFile)?;

    info!("disk size {}", disk_size);

    // Try to read the disk in a nicely-aligned block size unless the whole file is smaller.
    const MAGIC_BLOCK_SIZE: usize = 4096;
    #[repr(align(4096))]
    struct BlockAlignedBuffer {
        data: [u8; MAGIC_BLOCK_SIZE],
    }
    let mut magic = BlockAlignedBuffer {
        data: [0u8; MAGIC_BLOCK_SIZE],
    };
    let magic_read_len = if disk_size > MAGIC_BLOCK_SIZE as u64 {
        MAGIC_BLOCK_SIZE
    } else {
        // This cast is safe since we know disk_size is less than MAGIC_BLOCK_SIZE (4096) and
        // therefore is representable in usize.
        disk_size as usize
    };

    read_from_disk(f, 0, &mut magic.data[0..magic_read_len], overlapped_mode)?;
    // Restore the caller's original cursor position before returning.
    f.seek(SeekFrom::Start(orig_seek))
        .map_err(Error::SeekingFile)?;

    #[cfg(feature = "composite-disk")]
    if let Some(cdisk_magic) = magic.data.get(0..CDISK_MAGIC.len()) {
        if cdisk_magic == CDISK_MAGIC.as_bytes() {
            return Ok(ImageType::CompositeDisk);
        }
    }

    #[allow(unused_variables)] // magic4 is only used with the qcow/android-sparse/zstd features.
    if let Some(magic4) = magic.data.get(0..4).and_then(|v| <[u8; 4]>::try_from(v).ok()) {
        #[cfg(feature = "qcow")]
        if magic4 == QCOW_MAGIC.to_be_bytes() {
            return Ok(ImageType::Qcow2);
        }
        #[cfg(feature = "android-sparse")]
        if magic4 == SPARSE_HEADER_MAGIC.to_le_bytes() {
            return Ok(ImageType::AndroidSparse);
        }
        #[cfg(feature = "zstd")]
        {
            // Convert once instead of re-deriving the little-endian value for each comparison.
            let magic_le = u32::from_le_bytes(magic4);
            if magic_le == ZSTD_FRAME_MAGIC
                || (ZSTD_SKIPPABLE_MAGIC_LOW..=ZSTD_SKIPPABLE_MAGIC_HIGH).contains(&magic_le)
            {
                return Ok(ImageType::Zstd);
            }
        }
    }

    // No known header matched; treat the image as raw.
    Ok(ImageType::Raw)
}

impl DiskFile for File {
    fn try_clone(&self) -> io::Result<Box<dyn DiskFile>> {
        // This resolves to `File`'s inherent `try_clone` (inherent methods take precedence over
        // trait methods), so it is not a recursive call; per std docs the clone refers to the
        // same underlying open file, as the `DiskFile::try_clone` contract requires.
        Ok(Box::new(self.try_clone()?))
    }
}

/// Parameters controlling how a disk image file is opened by [`open_disk_file`].
pub struct DiskFileParams {
    /// Path of the disk image file to open.
    pub path: PathBuf,
    /// Whether to open the file read only.
    pub is_read_only: bool,
    /// Whether to call `base::set_sparse_file` on the file. Currently only affects Windows and is
    /// irrelevant for read only files.
    pub is_sparse_file: bool,
    /// Whether to open the file in overlapped mode. Only affects Windows.
    pub is_overlapped: bool,
    /// Whether to disable OS page caches / buffering.
    pub is_direct: bool,
    /// Whether to lock the file.
    pub lock: bool,
    /// The nesting depth of the file. Used to avoid infinite recursion. Users outside the disk
    /// crate should set this to zero.
    pub depth: u32,
}

/// Inspect the image file type and create an appropriate disk file to match it.
pub fn open_disk_file(params: DiskFileParams) -> Result<Box<dyn DiskFile>> {
    if params.depth > MAX_NESTING_DEPTH {
        return Err(Error::MaxNestingDepthExceeded);
    }

    let raw_image = sys::open_raw_disk_image(&params)?;
    let image_type = detect_image_type(&raw_image, params.is_overlapped)?;
    Ok(match image_type {
        ImageType::Raw => {
            sys::apply_raw_disk_file_options(&raw_image, params.is_sparse_file)?;
            Box::new(raw_image) as Box<dyn DiskFile>
        }
        #[cfg(feature = "qcow")]
        ImageType::Qcow2 => Box::new(QcowFile::from(raw_image, params).map_err(Error::QcowError)?)
            as Box<dyn DiskFile>,
        #[cfg(feature = "composite-disk")]
        ImageType::CompositeDisk => {
            // Valid composite disk header present
            Box::new(
                CompositeDiskFile::from_file(raw_image, params)
                    .map_err(Error::CreateCompositeDisk)?,
            ) as Box<dyn DiskFile>
        }
        #[cfg(feature = "android-sparse")]
        ImageType::AndroidSparse => {
            Box::new(AndroidSparse::from_file(raw_image).map_err(Error::CreateAndroidSparseDisk)?)
                as Box<dyn DiskFile>
        }
        #[cfg(feature = "zstd")]
        ImageType::Zstd => Box::new(ZstdDisk::from_file(raw_image).map_err(Error::CreateZstdDisk)?)
            as Box<dyn DiskFile>,
        #[allow(unreachable_patterns)]
        _ => return Err(Error::UnknownType),
    })
}

/// An asynchronously accessible disk.
#[async_trait(?Send)]
pub trait AsyncDisk: DiskGetLen + FileSetLen + FileAllocate {
    /// Flush intermediary buffers and/or dirty state to file. fsync not required.
    async fn flush(&self) -> Result<()>;

    /// Asynchronously fsyncs any completed operations to the disk.
    async fn fsync(&self) -> Result<()>;

    /// Asynchronously fdatasyncs any completed operations to the disk.
    /// Note that an implementation may simply call fsync for fdatasync.
    async fn fdatasync(&self) -> Result<()>;

    /// Reads from the file at 'file_offset' into memory `mem` at `mem_offsets`.
    /// `mem_offsets` is similar to an iovec except relative to the start of `mem`.
    /// Returns the number of bytes read.
    async fn read_to_mem<'a>(
        &'a self,
        file_offset: u64,
        mem: Arc<dyn BackingMemory + Send + Sync>,
        mem_offsets: cros_async::MemRegionIter<'a>,
    ) -> Result<usize>;

    /// Writes to the file at 'file_offset' from memory `mem` at `mem_offsets`.
    /// Returns the number of bytes written.
    async fn write_from_mem<'a>(
        &'a self,
        file_offset: u64,
        mem: Arc<dyn BackingMemory + Send + Sync>,
        mem_offsets: cros_async::MemRegionIter<'a>,
    ) -> Result<usize>;

    /// Replaces a range of bytes with a hole.
    async fn punch_hole(&self, file_offset: u64, length: u64) -> Result<()>;

    /// Writes `length` bytes of zeroes to the file starting at `file_offset`.
    async fn write_zeroes_at(&self, file_offset: u64, length: u64) -> Result<()>;

    /// Reads from the file at 'file_offset' into `buf`.
    ///
    /// Less efficient than `read_to_mem` because of extra copies and allocations.
    async fn read_double_buffered(&self, file_offset: u64, buf: &mut [u8]) -> Result<usize> {
        // Stage the read into a heap-allocated buffer the async backend can target, then copy
        // the bytes actually read out into the caller's slice.
        let backing_mem = Arc::new(cros_async::VecIoWrapper::from(vec![0u8; buf.len()]));
        let region = cros_async::MemRegion {
            offset: 0,
            len: buf.len(),
        };
        let n = self
            .read_to_mem(
                file_offset,
                backing_mem.clone(),
                MemRegionIter::new(&[region]),
            )
            .await?;
        backing_mem
            .get_volatile_slice(region)
            .expect("BUG: the VecIoWrapper shrank?")
            .sub_slice(0, n)
            .expect("BUG: read_to_mem return value too large?")
            .copy_to(buf);
        Ok(n)
    }

    /// Writes to the file at 'file_offset' from `buf`.
    ///
    /// Less efficient than `write_from_mem` because of extra copies and allocations.
    async fn write_double_buffered(&self, file_offset: u64, buf: &[u8]) -> Result<usize> {
        // Copy the caller's slice into a heap buffer the async backend can source from.
        let backing_mem = Arc::new(cros_async::VecIoWrapper::from(buf.to_vec()));
        let region = cros_async::MemRegion {
            offset: 0,
            len: buf.len(),
        };
        self.write_from_mem(
            file_offset,
            backing_mem,
            cros_async::MemRegionIter::new(&[region]),
        )
        .await
    }
}

/// A disk backed by a single file that implements `AsyncDisk` for access.
pub struct SingleFileDisk {
    // Async I/O handle wrapping the backing file.
    inner: IoSource<File>,
    // Whether the backed file is a block device since the punch-hole needs different operation.
    #[cfg(any(target_os = "android", target_os = "linux"))]
    is_block_device_file: bool,
}

// The following impls simply delegate to the wrapped `File` (`self.inner.as_source()`).

impl DiskGetLen for SingleFileDisk {
    fn get_len(&self) -> io::Result<u64> {
        self.inner.as_source().get_len()
    }
}

impl FileSetLen for SingleFileDisk {
    fn set_len(&self, len: u64) -> io::Result<()> {
        self.inner.as_source().set_len(len)
    }
}

impl FileAllocate for SingleFileDisk {
    fn allocate(&self, offset: u64, len: u64) -> io::Result<()> {
        self.inner.as_source().allocate(offset, len)
    }
}

#[async_trait(?Send)]
impl AsyncDisk for SingleFileDisk {
    async fn flush(&self) -> Result<()> {
        // Nothing to flush, all file mutations are immediately sent to the OS.
        Ok(())
    }

    async fn fsync(&self) -> Result<()> {
        self.inner.fsync().await.map_err(Error::Fsync)
    }

    async fn fdatasync(&self) -> Result<()> {
        self.inner.fdatasync().await.map_err(Error::Fdatasync)
    }

    async fn read_to_mem<'a>(
        &'a self,
        file_offset: u64,
        mem: Arc<dyn BackingMemory + Send + Sync>,
        mem_offsets: cros_async::MemRegionIter<'a>,
    ) -> Result<usize> {
        self.inner
            .read_to_mem(Some(file_offset), mem, mem_offsets)
            .await
            .map_err(Error::ReadToMem)
    }

    async fn write_from_mem<'a>(
        &'a self,
        file_offset: u64,
        mem: Arc<dyn BackingMemory + Send + Sync>,
        mem_offsets: cros_async::MemRegionIter<'a>,
    ) -> Result<usize> {
        self.inner
            .write_from_mem(Some(file_offset), mem, mem_offsets)
            .await
            .map_err(Error::WriteFromMem)
    }

    async fn punch_hole(&self, file_offset: u64, length: u64) -> Result<()> {
        // Block device files need the dedicated discard path instead of a
        // regular-file hole punch.
        #[cfg(any(target_os = "android", target_os = "linux"))]
        if self.is_block_device_file {
            return base::linux::discard_block(self.inner.as_source(), file_offset, length)
                .map_err(Error::PunchHoleBlockDeviceFile);
        }
        self.inner
            .punch_hole(file_offset, length)
            .await
            .map_err(Error::PunchHole)
    }

    async fn write_zeroes_at(&self, file_offset: u64, length: u64) -> Result<()> {
        // Try the efficient zeroing path first; its error is deliberately discarded so we can
        // fall back to writing explicit zero buffers below.
        if self
            .inner
            .write_zeroes_at(file_offset, length)
            .await
            .is_ok()
        {
            return Ok(());
        }

        // Fall back to filling zeros if more efficient write_zeroes_at doesn't work.
        let buf_size = min(length, 0x10000);
        let mut nwritten = 0;
        // Allocate the zero-filled buffer once; `write_from_vec` hands the Vec back, so it can
        // be reused across iterations instead of reallocating 64 KiB per write.
        let mut buf = vec![0u8; buf_size as usize];
        while nwritten < length {
            let remaining = length - nwritten;
            // `write_size` never grows between iterations (remaining only shrinks), so
            // truncating the reused buffer is sufficient.
            let write_size = min(remaining, buf_size) as usize;
            buf.truncate(write_size);
            let (n, returned_buf) = self
                .inner
                .write_from_vec(Some(file_offset + nwritten), buf)
                .await
                .map_err(Error::WriteFromVec)?;
            nwritten += n as u64;
            buf = returned_buf;
        }
        Ok(())
    }
}