diff options
-rw-r--r-- | .gitignore | 1 | ||||
-rw-r--r-- | Cargo.lock | 104 | ||||
-rw-r--r-- | Cargo.toml | 9 | ||||
-rw-r--r-- | LICENSE | 21 | ||||
-rw-r--r-- | README.md | 91 | ||||
-rw-r--r-- | src/lib.rs | 184 | ||||
-rw-r--r-- | src/main.rs | 144 |
7 files changed, 554 insertions, 0 deletions
diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..ea8c4bf --- /dev/null +++ b/.gitignore @@ -0,0 +1 @@ +/target diff --git a/Cargo.lock b/Cargo.lock new file mode 100644 index 0000000..bdbb878 --- /dev/null +++ b/Cargo.lock @@ -0,0 +1,104 @@ +# This file is automatically @generated by Cargo. +# It is not intended for manual editing. +version = 3 + +[[package]] +name = "cfg-if" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" + +[[package]] +name = "goblin" +version = "0.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0b1800b95efee8ad4ef04517d4d69f8e209e763b1668f1179aeeedd0e454da55" +dependencies = [ + "log", + "plain", + "scroll", +] + +[[package]] +name = "libc" +version = "0.2.98" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "320cfe77175da3a483efed4bc0adc1968ca050b098ce4f2f1c13a56626128790" + +[[package]] +name = "log" +version = "0.4.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "51b9bbe6c47d51fc3e1a9b945965946b4c44142ab8792c50835a980d362c2710" +dependencies = [ + "cfg-if", +] + +[[package]] +name = "plain" +version = "0.2.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b4596b6d070b27117e987119b4dac604f3c58cfb0b191112e24771b2faeac1a6" + +[[package]] +name = "proc-macro2" +version = "1.0.28" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5c7ed8b8c7b886ea3ed7dde405212185f423ab44682667c8c6dd14aa1d9f6612" +dependencies = [ + "unicode-xid", +] + +[[package]] +name = "quote" +version = "1.0.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c3d0b9745dc2debf507c8422de05d7226cc1f0644216dfdfead988f9b1ab32a7" +dependencies = [ + "proc-macro2", +] + +[[package]] +name = "scroll" +version = "0.10.2" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "fda28d4b4830b807a8b43f7b0e6b5df875311b3e7621d84577188c175b6ec1ec" +dependencies = [ + "scroll_derive", +] + +[[package]] +name = "scroll_derive" +version = "0.10.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "aaaae8f38bb311444cfb7f1979af0bc9240d95795f75f9ceddf6a59b79ceffa0" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "syn" +version = "1.0.74" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1873d832550d4588c3dbc20f01361ab00bfe741048f71e3fecf145a7cc18b29c" +dependencies = [ + "proc-macro2", + "quote", + "unicode-xid", +] + +[[package]] +name = "unicode-xid" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8ccb82d61f80a663efe1f787a51b16b5a51e3314d6ac365b08639f52387b33f3" + +[[package]] +name = "vdso-proxy-poc" +version = "0.1.0" +dependencies = [ + "goblin", + "libc", +] diff --git a/Cargo.toml b/Cargo.toml new file mode 100644 index 0000000..d7eb41d --- /dev/null +++ b/Cargo.toml @@ -0,0 +1,9 @@ +[package] +name = "vdso-proxy-poc" +authors = ["johannst <johannes.stoelp@gmail.com>"] +version = "0.1.0" +edition = "2018" + +[dependencies] +libc = "0.2" +goblin = { version = "0.4", default-features = false, features = ["std", "elf32", "elf64", "endian_fd"] } @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2021 Johannes Stoelp + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or 
substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/README.md b/README.md new file mode 100644 index 0000000..34afd4b --- /dev/null +++ b/README.md @@ -0,0 +1,91 @@ +# `vdso` proxy proof-of-concept + +## Background +Simply put, the `vdso` is an ELF file provided by the Linux Kernel and +mapped into a process to provide the implementation of certain `syscalls` in +userspace. Userspace can call those `virtual` syscalls without invoking a +_real_ syscall (e.g. the `syscall` instruction on x86-64). + +The location where the Kernel mapped the `vdso` can be found in the `maps` +(procfs) labeled with the `[vdso]` tag. +```bash +> cat /proc/self/maps | grep vdso +7ffeae5fb000-7ffeae5fd000 r-xp 00000000 00:00 0 [vdso] +``` + +More details about the `vdso` can be found here: +- https://man7.org/linux/man-pages/man7/vdso.7.html +- https://www.kernel.org/doc/Documentation/ABI/stable/vdso + +## Why do this? +This is some toying around and proof-of-concept for `process-checkpoint` +scenarios with `migration` in mind. +Typically a process checkpoint contains a dump of the virtual memory regions of +a process which are then re-mapped when restoring the process at a later point +in time. The vdso in this case needs some special treatment as the user code in +the checkpoint image might have some references into the vdso segment (usually +this is done behind the scenes by the `libc`) where it was when taking the +checkpoint.
+When restoring a checkpoint, the Kernel will map the `vdso` to a random virtual +address in the restoring process, therefore there are two cases to distinguish: +1. Restoring the checkpoint with the same Kernel. +2. Restoring the checkpoint with a different Kernel (`migration`). + +For case `(1)` the `vdso` can be [`mremap(2)`][man-mremap]-ed to the virtual +address where the vdso resided when creating the checkpoint. This is fine +because the _new_ and the _old_ `vdso` are compatible. + +For case `(2)` however it is possible that the binary layout of the _new_ +`vdso` has changed (e.g. different offsets for a given symbol) and is therefore +incompatible with the _old_ `vdso`. In that case a simple +[`mremap(2)`][man-mremap] won't do the trick. +This case is explored in this repository with a `proxy` mechanism which is +described by the figure below. + +```text +# Before checkpoint create. + + VMA + +---------------------+ + | libc: | + | gettimeofday(...) | + | .. | + | call | --+ + | .. | | User code binds to symbols in the vdso. +eg +-- +---------------------+ | ++0x10 | | vdso: | | + +-> | __vdso_gettimeofday | <-+ + | .. | + +---------------------+ + + +# After checkpoint restore. + + VMA + +---------------------+ + | libc: | + | gettimeofday(...) | + | .. | + | call | --+ + | .. | | After restoring the memory of the process checkpoint, +eg +-- +---------------------+ | user code still binds to symbols in the _old_ vdso region. ++0x10 | | [old] vdso: | | + +-> | __vdso_gettimeofday | <-+ + | jmp | --+ + | .. | | After restore, the functions in the _old_ vdso region +eg +-- +---------------------+ | are patched with a trampoline forwarding to the ++0x40 | | [new] vdso: | | corresponding function in the _new_ vdso region. + +-> | __vdso_gettimeofday | <-+ + | ..
| + +---------------------+ +``` + +This approach introduces the need for higher-level synchronization as it must +be ensured that no thread is in the middle of executing a `vdso` function when +creating the process checkpoint. This PoC doesn't take this into account as it +merely focuses on the mechanics described above. + +## License +This project is licensed under the [MIT](LICENSE) license. + +[man-mremap]: https://man7.org/linux/man-pages/man2/mremap.2.html diff --git a/src/lib.rs b/src/lib.rs new file mode 100644 index 0000000..18c1b74 --- /dev/null +++ b/src/lib.rs @@ -0,0 +1,184 @@ +//! Collection of some abstractions and the error definition used by this PoC. + +/// Errors that can occur in this PoC. +#[derive(Debug)] +pub enum Error { + /// Failed to open and read the `/proc/self/maps` file. + FailedToReadMaps, + /// Failed to parse a line from the `/proc/self/maps` file. + /// Captures the line that failed to parse. + ParseMapEntryError(String), + /// No `[vdso]` segment found in `/proc/self/maps`. + VdsoSegmentNotFound, + /// Failed to parse bytes as ELF file. + FailedToParseAsElf, + /// No `PT_LOAD` program header found in the ELF file. + LoadPhdrNotFound, + /// Requested symbol not found in the ELF file. + /// Captures the name of the requested symbol. + SymbolNotFound(String), +} + +/// Representation of a 64-bit virtual address. +#[derive(Debug, Copy, Clone)] +pub struct VirtAddr(pub u64); + +/// Represents an entry from the `/proc/self/maps` file with the information needed by this PoC. +/// ```text +/// man 5 proc +/// address perms offset dev inode pathname +/// 00400000-00452000 r-xp 00000000 08:02 173521 /usr/bin/dbus-daemon +/// ``` +#[derive(Debug)] +pub struct MapEntry { + /// Start address of the memory segment. + pub addr: u64, + /// Length of the memory segment. + pub len: u64, + /// Optional name of the memory segment. + pub name: Option<String>, +} + +impl MapEntry { + /// Try to parse a [`MapEntry`] from the `line` passed as argument.
+ pub fn from_line<'a>(line: &'a str) -> Result<MapEntry, Error> { + let expect = |tok: Option<&'a str>| tok.ok_or(Error::ParseMapEntryError(line.into())); + + // Tokenize the line. + let mut toks = line.split_whitespace(); + let addr = expect(toks.next())?; + let _perms = expect(toks.next())?; + let _offset = expect(toks.next())?; + let _dev = expect(toks.next())?; + let _inode = expect(toks.next())?; + let name = toks.next().map(|name| String::from(name)); + + // Parse the address token. + let (addr, len) = { + let tou64 = |s: &'a str| { + u64::from_str_radix(s, 16) + .map_err(|e| Error::ParseMapEntryError(format!("{}\n{}", line, e))) + }; + + let mut toks = addr.split('-'); + let start = tou64(expect(toks.next())?)?; + let end = tou64(expect(toks.next())?)?; + + (start, end - start) + }; + + Ok(MapEntry { addr, len, name }) + } +} + +/// Owned [`libc::mmap`] allocation. +pub struct Mmap { + ptr: *mut libc::c_void, + len: usize, + map: MapEntry, +} + +impl Mmap { + /// Create a new allocation with `read | write | execute` permissions big enough to hold a copy + /// of `bytes` and initialize it with the `bytes` passed as argument. + pub fn new_rwx_from(bytes: &[u8]) -> Option<Mmap> { + use libc::{ + memcpy, mmap, sysconf, MAP_ANONYMOUS, MAP_FAILED, MAP_PRIVATE, PROT_EXEC, PROT_READ, + PROT_WRITE, _SC_PAGESIZE, + }; + + // Get the page size. + let page_size = unsafe { sysconf(_SC_PAGESIZE) } as usize; + + // Compute required size for the new allocation by rounding up to the next page size. + let len = ((bytes.len() + page_size - 1) / page_size) * page_size; + + // Allocate new `rwx` memory segment. + let ptr = unsafe { + mmap( + std::ptr::null_mut(), + len, + PROT_READ | PROT_WRITE | PROT_EXEC, + MAP_PRIVATE | MAP_ANONYMOUS, + -1, /* fd */ + 0, /* offset */ + ) + }; + + if ptr == MAP_FAILED { + return None; + } + + unsafe { + // Initialize new allocation with `bytes` passed as argument. 
+ memcpy(ptr, bytes.as_ptr().cast(), bytes.len()); + } + + Some(Mmap { + ptr, + len, + map: MapEntry { + addr: ptr as u64, + len: len as u64, + name: Some("mmap_rwx".into()), + }, + }) + } +} + +impl Drop for Mmap { + fn drop(&mut self) { + unsafe { libc::munmap(self.ptr, self.len) }; + } +} + +impl AsRef<MapEntry> for Mmap { + fn as_ref(&self) -> &'_ MapEntry { + &self.map + } +} + +impl AsMut<[u8]> for Mmap { + fn as_mut(&mut self) -> &mut [u8] { + unsafe { std::slice::from_raw_parts_mut(self.ptr.cast(), self.len) } + } +} + +/// An `x86_64` jump pad (trampoline) that can be installed at a [`VirtAddr`]. +/// +/// The jump pad is implemented as: +/// ```asm +/// mov rax, imm64 ; target +/// jmp rax +/// ``` +#[allow(dead_code)] +#[cfg(target_arch = "x86_64")] +#[repr(packed)] +pub struct JmpPad { + movabs: u16, + target: u64, + jmp_rax: u16, +} + +#[cfg(target_arch = "x86_64")] +impl JmpPad { + /// Initialize a new jump pad to the destination virtual address `target`. + /// This does not install the jump pad. + pub fn to(target: VirtAddr) -> JmpPad { + JmpPad { + movabs: 0xb848, // REX.W + mov rax, imm64 + target: target.0, + jmp_rax: 0xe0ff, // jmp rax + } + } + + /// Install the jump pad at the virtual address `addr`. + /// + /// # Safety + /// The caller must guarantee the following constraints: + /// - `addr` must be a valid virtual address referring to writeable memory. + /// - There must be enough space to store [`size_of::<JmpPad>()`](core::mem::size_of) bytes.
+ pub unsafe fn install_at(self, addr: VirtAddr) { + std::ptr::write(addr.0 as *mut JmpPad, self); + } +} diff --git a/src/main.rs b/src/main.rs new file mode 100644 index 0000000..400b2d7 --- /dev/null +++ b/src/main.rs @@ -0,0 +1,144 @@ +use goblin::elf::{Elf, program_header::PT_LOAD}; +use std::convert::TryFrom; +use vdso_proxy_poc::{Error, JmpPad, MapEntry, Mmap, VirtAddr}; + +#[cfg(not(target_os = "linux"))] +compile_error!("This only makes sense on Linux, as we are poking the vdso."); + +/// Find the `[vdso]` entry in `/proc/self/maps`. +fn get_vdso() -> Result<MapEntry, Error> { + for line in std::fs::read_to_string("/proc/self/maps") + .map_err(|_| Error::FailedToReadMaps)? + .lines() + { + let map = MapEntry::from_line(line)?; + match &map.name { + Some(n) if n == "[vdso]" => return Ok(map), + _ => {} + } + } + Err(Error::VdsoSegmentNotFound) +} + +/// Create a copy of the `vdso` memory segment. Effectively allocates memory and copies the virtual +/// address range described by `vdso`. +/// +/// # Safety: +/// The caller must guarantee that the `vdso` argument describes a valid virtual address range by +/// its `address` and `length` fields. +#[allow(unused_unsafe)] +unsafe fn copy_vdso(vdso: &MapEntry) -> Option<Mmap> { + let bytes = { + let ptr = vdso.addr as *const u8; + let len = usize::try_from(vdso.len) + .expect("It's required that the segment length fits into a usize!"); + // SAFETY: Validity of ptr & len must be ensured by the caller. + unsafe { std::slice::from_raw_parts(ptr, len) } + }; + + Mmap::new_rwx_from(&bytes) +} + +/// Find the `symbol_name` in the vdso described by the [`MapEntry`] memory segment. +/// +/// # Safety: +/// The caller must guarantee that the `vdso` argument describes a valid virtual address range by +/// its `address` and `length` fields. 
+/// +/// # Note: +/// Currently the version of the symbol is not checked, technically this is an error which can be +/// fatal in case of a binary incompatibility, but that's accepted for this PoC. +#[allow(unused_unsafe)] +unsafe fn get_vdso_sym(vdso: &MapEntry, symbol_name: &str) -> Result<VirtAddr, Error> { + // Turn `vdso` maps entry into slice of bytes. + let bytes = { + let ptr = vdso.addr as *const u8; + let len = usize::try_from(vdso.len) + .expect("It's required that the segment length fits into a usize!"); + // SAFETY: Validity of ptr & len must be ensured by the caller. + unsafe { std::slice::from_raw_parts(ptr, len) } + }; + + // Parse vdso bytes as ELF. + let elf = Elf::parse(bytes).map_err(|_| Error::FailedToParseAsElf)?; + + // Compute the dynamic shared object (dso) base address. Symbol offsets are relative to this + // dso base address. + let dso_base = { + let phdr_load = elf + .program_headers + .iter() + .find(|p| p.p_type == PT_LOAD) + .ok_or(Error::LoadPhdrNotFound)?; + vdso.addr - phdr_load.p_offset - phdr_load.p_vaddr + }; + assert_ne!(dso_base, 0, "If the dso base address is 0 that means the symbols contain absolute addresses, we don't want to support that!"); + + // Try to find the requested symbol. + let sym = elf + .dynsyms + .iter() + .filter(|sym| sym.is_function()) + .find(|sym| matches!(elf.dynstrtab.get_at(sym.st_name), Some(sym) if sym == symbol_name)) + .ok_or(Error::SymbolNotFound(symbol_name.into()))?; + + // Compute the absolute virtual address of the requested symbol. + Ok(VirtAddr(dso_base + sym.st_value)) +} + +/// Represent the `struct timeval` C structure (see `man 2 gettimeofday`). +#[repr(C)] +struct Timeval { + tv_sec: i64, + tv_usec: i64, +} + +fn main() -> Result<(), Error> { + // This represents the _new_ vdso pages that the kernel mapped into the restoring process. + let orig_vdso = get_vdso()?; + + // This represents the _old_ vdso pages that were captured in the memory dump of the process + // checkpoint. 
+ // + // SAFETY: orig_vdso describes a valid memory region as we got it from /proc/self/maps. + let copy_vdso = unsafe { copy_vdso(&orig_vdso).expect("Copy of vdso must succeed!") }; + + let (orig_sym_addr, copy_sym_addr) = unsafe { + // SAFETY: orig_vdso describes a valid memory region as we got it from /proc/self/maps. + let orig = get_vdso_sym(&orig_vdso, "__vdso_gettimeofday")?; + // SAFETY: copy_vdso describes a valid and owned memory allocation. + let copy = get_vdso_sym(&copy_vdso.as_ref(), "__vdso_gettimeofday")?; + + (orig, copy) + }; + + // As an example, install a trampoline for the `__vdso_gettimeofday` symbol. The trampoline is + // installed in the _old_ vdso pages, where the user code from the checkpoint image links to, + // and forwards the calls into the _new_ vdso pages. + let pad = JmpPad::to(orig_sym_addr); + // SAFETY: copy_sym_addr is a valid virtual address as we got it from the symbol lookup. + unsafe { pad.install_at(copy_sym_addr) }; + + let mut tv: Timeval = Timeval { + tv_sec: 0, + tv_usec: 0, + }; + + unsafe { + // Mimic a call to `__vdso_gettimeofday` from user code which is still linked to the _old_ + // vdso. + + // SAFETY: copy_sym_addr is a valid virtual address pointing to the `__vdso_gettimeofday` + // function. + let gettimeofday: extern "C" fn(*mut Timeval, *mut libc::c_void) -> i32 = + std::mem::transmute(copy_sym_addr.0 as *const ()); + + // Invoke the `__vdso_gettimeofday` function in the copied memory region (_old_ vdso). This + // should forward to the function in the original memory region. + gettimeofday(&mut tv as *mut Timeval, std::ptr::null_mut()); + } + + println!("Timeval tv_sec : {} tv_usec : {}", tv.tv_sec, tv.tv_usec); + + Ok(()) +} |