aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--.gitignore1
-rw-r--r--Cargo.lock104
-rw-r--r--Cargo.toml9
-rw-r--r--LICENSE21
-rw-r--r--README.md91
-rw-r--r--src/lib.rs184
-rw-r--r--src/main.rs144
7 files changed, 554 insertions, 0 deletions
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..ea8c4bf
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1 @@
+/target
diff --git a/Cargo.lock b/Cargo.lock
new file mode 100644
index 0000000..bdbb878
--- /dev/null
+++ b/Cargo.lock
@@ -0,0 +1,104 @@
+# This file is automatically @generated by Cargo.
+# It is not intended for manual editing.
+version = 3
+
+[[package]]
+name = "cfg-if"
+version = "1.0.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd"
+
+[[package]]
+name = "goblin"
+version = "0.4.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "0b1800b95efee8ad4ef04517d4d69f8e209e763b1668f1179aeeedd0e454da55"
+dependencies = [
+ "log",
+ "plain",
+ "scroll",
+]
+
+[[package]]
+name = "libc"
+version = "0.2.98"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "320cfe77175da3a483efed4bc0adc1968ca050b098ce4f2f1c13a56626128790"
+
+[[package]]
+name = "log"
+version = "0.4.14"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "51b9bbe6c47d51fc3e1a9b945965946b4c44142ab8792c50835a980d362c2710"
+dependencies = [
+ "cfg-if",
+]
+
+[[package]]
+name = "plain"
+version = "0.2.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b4596b6d070b27117e987119b4dac604f3c58cfb0b191112e24771b2faeac1a6"
+
+[[package]]
+name = "proc-macro2"
+version = "1.0.28"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "5c7ed8b8c7b886ea3ed7dde405212185f423ab44682667c8c6dd14aa1d9f6612"
+dependencies = [
+ "unicode-xid",
+]
+
+[[package]]
+name = "quote"
+version = "1.0.9"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "c3d0b9745dc2debf507c8422de05d7226cc1f0644216dfdfead988f9b1ab32a7"
+dependencies = [
+ "proc-macro2",
+]
+
+[[package]]
+name = "scroll"
+version = "0.10.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "fda28d4b4830b807a8b43f7b0e6b5df875311b3e7621d84577188c175b6ec1ec"
+dependencies = [
+ "scroll_derive",
+]
+
+[[package]]
+name = "scroll_derive"
+version = "0.10.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "aaaae8f38bb311444cfb7f1979af0bc9240d95795f75f9ceddf6a59b79ceffa0"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "syn",
+]
+
+[[package]]
+name = "syn"
+version = "1.0.74"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "1873d832550d4588c3dbc20f01361ab00bfe741048f71e3fecf145a7cc18b29c"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "unicode-xid",
+]
+
+[[package]]
+name = "unicode-xid"
+version = "0.2.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "8ccb82d61f80a663efe1f787a51b16b5a51e3314d6ac365b08639f52387b33f3"
+
+[[package]]
+name = "vdso-proxy-poc"
+version = "0.1.0"
+dependencies = [
+ "goblin",
+ "libc",
+]
diff --git a/Cargo.toml b/Cargo.toml
new file mode 100644
index 0000000..d7eb41d
--- /dev/null
+++ b/Cargo.toml
@@ -0,0 +1,9 @@
+[package]
+name = "vdso-proxy-poc"
+authors = ["johannst <johannes.stoelp@gmail.com>"]
+version = "0.1.0"
+edition = "2018"
+
+[dependencies]
+libc = "0.2"
+goblin = { version = "0.4", default-features = false, features = ["std", "elf32", "elf64", "endian_fd"] }
diff --git a/LICENSE b/LICENSE
new file mode 100644
index 0000000..6144be6
--- /dev/null
+++ b/LICENSE
@@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) 2021 Johannes Stoelp
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..34afd4b
--- /dev/null
+++ b/README.md
@@ -0,0 +1,91 @@
+# `vdso` proxy proof-of-concept
+
+## Background
+Simply put, the `vdso` is an ELF file provided by the Linux Kernel and
+mapped into a process to provide the implementation of certain `syscalls` in
+userspace. Userspace can call those `virtual` syscalls without invoking a
+_real_ syscall (e.g. the `syscall` instruction on x86-64).
+
+The location where the Kernel mapped the `vdso` can be found in the `maps`
+(procfs) labeled with the `[vdso]` tag.
+```bash
+> cat /proc/self/maps | grep vdso
+7ffeae5fb000-7ffeae5fd000 r-xp 00000000 00:00 0 [vdso]
+```
+
+More details about the `vdso` can be found here:
+- https://man7.org/linux/man-pages/man7/vdso.7.html
+- https://www.kernel.org/doc/Documentation/ABI/stable/vdso
+
+## Why do this?
+This is some toying around and proof-of-concept for `process-checkpoint`
+scenarios with `migration` in mind.
+Typically a process checkpoint contains a dump of the virtual memory regions of
+a process which are then re-mapped when restoring the process at a later point
+in time. The vdso in this case needs some special treatment as the user code in
+the checkpoint image might have some references into the vdso segment (usually
+this is done behind the scenes by the `libc`) at the address where it was
+mapped when taking the checkpoint.
+When restoring a checkpoint, the Kernel will map the `vdso` to a random virtual
+address in the restoring process, therefore there are two cases to distinguish:
+1. Restoring the checkpoint with the same Kernel.
+1. Restoring the checkpoint with a different Kernel (`migration`).
+
+For case `(1)` the `vdso` can be [`mremap(2)`][man-mremap]-ed to the virtual
+address where the vdso resided when creating the checkpoint. This is fine
+because the _new_ and the _old_ `vdso` are compatible.
+
+For case `(2)` however it is possible that the binary layout of the _new_
+`vdso` has changed (eg different offsets for a given symbol) and is therefore
+incompatible with the _old_ `vdso`. In that case a simple
+[`mremap(2)`][man-mremap] won't do the trick.
+This case is explored in this repository with a `proxy` mechanism which is
+described by the figure below.
+
+```text
+# Before checkpoint create.
+
+ VMA
+ +---------------------+
+ | libc: |
+ | gettimeofday(...) |
+ | .. |
+ | call | --+
+ | .. | | User code binds to symbols in the vdso.
+eg +-- +---------------------+ |
++0x10 | | vdso: | |
+ +-> | __vdso_gettimeofday | <-+
+ | .. |
+ +---------------------+
+
+
+# After checkpoint restore.
+
+ VMA
+ +---------------------+
+ | libc: |
+ | gettimeofday(...) |
+ | .. |
+ | call | --+
+ | .. | | After restoring the memory of the process checkpoint,
+eg +-- +---------------------+ | user code still binds to symbols in the _old_ vdso region.
++0x10 | | [old] vdso: | |
+ +-> | __vdso_gettimeofday | <-+
+ | jmp | --+
+ | .. | | After restore, the functions in the _old_ vdso region
+eg +-- +---------------------+ | are patched with a trampoline forwarding to the
++0x40 | | [new] vdso: | | corresponding function in the _new_ vdso region.
+ +-> | __vdso_gettimeofday | <-+
+ | .. |
+ +---------------------+
+```
+
+This approach introduces the need for a higher-level synchronization as it must
+be ensured that no thread is in the middle of executing a `vdso` function when
+creating the process checkpoint. This PoC doesn't take this into account as it
+merely focuses on the mechanics described above.
+
+## License
+This project is licensed under the [MIT](LICENSE) license.
+
+[man-mremap]: https://man7.org/linux/man-pages/man2/mremap.2.html
diff --git a/src/lib.rs b/src/lib.rs
new file mode 100644
index 0000000..18c1b74
--- /dev/null
+++ b/src/lib.rs
@@ -0,0 +1,184 @@
+//! Collection of some abstractions and the error definition used by this PoC.
+
+/// Errors that can occur in this PoC.
+///
+/// Implements [`std::fmt::Display`] and [`std::error::Error`] so it composes with `?`,
+/// `Box<dyn Error>` and the usual error-reporting machinery.
+#[derive(Debug)]
+pub enum Error {
+    /// Failed to open and read the `/proc/self/maps` file.
+    FailedToReadMaps,
+    /// Failed to parse a line from the `/proc/self/maps` file.
+    /// Captures the line that failed to parse (optionally followed by the underlying
+    /// number-parsing error on a second line).
+    ParseMapEntryError(String),
+    /// No `[vdso]` segment found in `/proc/self/maps`.
+    VdsoSegmentNotFound,
+    /// Failed to parse bytes as ELF file.
+    FailedToParseAsElf,
+    /// No `PT_LOAD` program header found in the ELF file.
+    LoadPhdrNotFound,
+    /// Requested symbol not found in the ELF file.
+    /// Captures the name of the requested symbol.
+    SymbolNotFound(String),
+}
+
+impl std::fmt::Display for Error {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        match self {
+            Error::FailedToReadMaps => f.write_str("failed to read /proc/self/maps"),
+            Error::ParseMapEntryError(line) => {
+                write!(f, "failed to parse /proc/self/maps entry: {}", line)
+            }
+            Error::VdsoSegmentNotFound => {
+                f.write_str("no [vdso] segment found in /proc/self/maps")
+            }
+            Error::FailedToParseAsElf => f.write_str("failed to parse bytes as ELF file"),
+            Error::LoadPhdrNotFound => {
+                f.write_str("no PT_LOAD program header found in ELF file")
+            }
+            Error::SymbolNotFound(name) => write!(f, "symbol not found in ELF file: {}", name),
+        }
+    }
+}
+
+impl std::error::Error for Error {}
+
+/// Representation of a 64-bit virtual address.
+///
+/// A thin newtype around `u64`; it is cheap to copy and can be compared, ordered and hashed
+/// like the raw address it wraps.
+#[derive(Debug, Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)]
+pub struct VirtAddr(pub u64);
+
+/// Represents an entry from the `/proc/self/maps` file with the information needed by this PoC.
+/// ```text
+/// man 5 proc
+/// address           perms offset  dev   inode   pathname
+/// 00400000-00452000 r-xp 00000000 08:02 173521  /usr/bin/dbus-daemon
+/// ```
+///
+/// Only the address range and the (optional) pathname column are retained.
+#[derive(Debug, Clone, PartialEq, Eq)]
+pub struct MapEntry {
+    /// Start address of the memory segment.
+    pub addr: u64,
+    /// Length of the memory segment in bytes (`end - start` of the address range).
+    pub len: u64,
+    /// Optional name of the memory segment.
+    pub name: Option<String>,
+}
+
+impl MapEntry {
+    /// Try to parse a [`MapEntry`] from the `line` passed as argument.
+    ///
+    /// The line must contain at least the `address`, `perms`, `offset`, `dev` and `inode`
+    /// columns. The `pathname` column is optional; if present, only its first
+    /// whitespace-separated token is kept as the name.
+    ///
+    /// # Errors
+    /// Returns [`Error::ParseMapEntryError`] if a mandatory column is missing, if the address
+    /// column is not of the form `<hex>-<hex>`, or if the end address is smaller than the start
+    /// address.
+    pub fn from_line<'a>(line: &'a str) -> Result<MapEntry, Error> {
+        // Build the error lazily so the happy path does not allocate a copy of the line for
+        // every token that is checked.
+        let malformed = || Error::ParseMapEntryError(line.into());
+
+        // Tokenize the line.
+        let mut toks = line.split_whitespace();
+        let range = toks.next().ok_or_else(malformed)?;
+        // The perms, offset, dev and inode columns must be present but are not used.
+        for _ in 0..4 {
+            toks.next().ok_or_else(malformed)?;
+        }
+        let name = toks.next().map(String::from);
+
+        // Parse the `start-end` address token.
+        let parse_hex = |s: &str| {
+            u64::from_str_radix(s, 16)
+                .map_err(|e| Error::ParseMapEntryError(format!("{}\n{}", line, e)))
+        };
+        let mut bounds = range.split('-');
+        let addr = parse_hex(bounds.next().ok_or_else(malformed)?)?;
+        let end = parse_hex(bounds.next().ok_or_else(malformed)?)?;
+
+        // Reject inverted ranges instead of underflowing (panic in debug builds, bogus huge
+        // length in release builds).
+        let len = end.checked_sub(addr).ok_or_else(malformed)?;
+
+        Ok(MapEntry { addr, len, name })
+    }
+}
+
+/// Owned [`libc::mmap`] allocation.
+///
+/// The mapping is released again with [`libc::munmap`] when the value is dropped.
+pub struct Mmap {
+ /// Start address of the mapping as returned by `mmap`.
+ ptr: *mut libc::c_void,
+ /// Length of the mapping in bytes (as passed to `mmap` / `munmap`).
+ len: usize,
+ /// Description of the mapping as a [`MapEntry`], handed out via `AsRef<MapEntry>`.
+ map: MapEntry,
+}
+
+impl Mmap {
+    /// Create a new allocation with `read | write | execute` permissions big enough to hold a copy
+    /// of `bytes` and initialize it with the `bytes` passed as argument.
+    ///
+    /// The allocation size is `bytes.len()` rounded up to a multiple of the page size.
+    ///
+    /// Returns `None` if the page size cannot be determined, if the rounded size overflows, or
+    /// if `mmap` fails (which includes the case of empty `bytes`, as a zero-length mapping is
+    /// rejected by `mmap`).
+    pub fn new_rwx_from(bytes: &[u8]) -> Option<Mmap> {
+        use libc::{
+            mmap, sysconf, MAP_ANONYMOUS, MAP_FAILED, MAP_PRIVATE, PROT_EXEC, PROT_READ,
+            PROT_WRITE, _SC_PAGESIZE,
+        };
+
+        // Get the page size. `sysconf` reports failure as -1, which must not be cast to a
+        // (huge) unsigned value.
+        // SAFETY: `sysconf` has no memory-safety preconditions.
+        let page_size = unsafe { sysconf(_SC_PAGESIZE) };
+        if page_size <= 0 {
+            return None;
+        }
+        let page_size = page_size as usize;
+
+        // Compute required size for the new allocation by rounding up to the next page size.
+        let len = bytes.len().checked_add(page_size - 1)? / page_size * page_size;
+
+        // Allocate new `rwx` memory segment.
+        // SAFETY: An anonymous private mapping at a kernel-chosen address does not alias any
+        // existing Rust object.
+        let ptr = unsafe {
+            mmap(
+                std::ptr::null_mut(),
+                len,
+                PROT_READ | PROT_WRITE | PROT_EXEC,
+                MAP_PRIVATE | MAP_ANONYMOUS,
+                -1, /* fd */
+                0,  /* offset */
+            )
+        };
+
+        if ptr == MAP_FAILED {
+            return None;
+        }
+
+        // Initialize new allocation with `bytes` passed as argument.
+        // SAFETY: `ptr` refers to a fresh writable mapping of `len >= bytes.len()` bytes which
+        // cannot overlap with `bytes`.
+        unsafe { std::ptr::copy_nonoverlapping(bytes.as_ptr(), ptr.cast::<u8>(), bytes.len()) };
+
+        Some(Mmap {
+            ptr,
+            len,
+            map: MapEntry {
+                addr: ptr as u64,
+                len: len as u64,
+                name: Some("mmap_rwx".into()),
+            },
+        })
+    }
+}
+
+impl Drop for Mmap {
+    /// Unmap the allocation. The return value of `munmap` is not inspected.
+    fn drop(&mut self) {
+        // SAFETY: `ptr` and `len` are private and describe the mapping created in
+        // `Mmap::new_rwx_from`.
+        let _ = unsafe { libc::munmap(self.ptr, self.len) };
+    }
+}
+
+impl AsRef<MapEntry> for Mmap {
+    /// Borrow the [`MapEntry`] describing the address range of this allocation.
+    fn as_ref(&self) -> &MapEntry {
+        let Mmap { map, .. } = self;
+        map
+    }
+}
+
+impl AsMut<[u8]> for Mmap {
+    /// View the whole allocation (all `len` bytes) as a mutable byte slice.
+    fn as_mut(&mut self) -> &mut [u8] {
+        let base = self.ptr as *mut u8;
+        let size = self.len;
+        // SAFETY: `ptr`/`len` describe the mapping owned by `self`, and the `&mut self` borrow
+        // ties the lifetime of the returned slice to an exclusive borrow of the owner.
+        unsafe { std::slice::from_raw_parts_mut(base, size) }
+    }
+}
+
+/// An `x86_64` jump pad (trampoline) that can be installed at a [`VirtAddr`].
+///
+/// The jump pad is implemented as:
+/// ```asm
+/// mov rax, imm64 ; target
+/// jmp rax
+/// ```
+///
+/// The struct is written to memory verbatim as machine code, so its layout must be exactly
+/// `movabs | target | jmp_rax` without padding. `repr(C)` pins the field order (plain
+/// `repr(packed)` keeps the default Rust representation, which leaves the order unspecified)
+/// and `packed` removes the padding in front of `target`.
+// The fields are only ever written out as raw bytes and never read by Rust code.
+#[allow(dead_code)]
+#[cfg(target_arch = "x86_64")]
+#[repr(C, packed)]
+pub struct JmpPad {
+    /// Opcode bytes of `mov rax, imm64`.
+    movabs: u16,
+    /// The `imm64` operand: absolute address to jump to.
+    target: u64,
+    /// Opcode bytes of `jmp rax`.
+    jmp_rax: u16,
+}
+
+#[cfg(target_arch = "x86_64")]
+impl JmpPad {
+    /// Build a jump pad that transfers control to the virtual address `target`.
+    /// The pad is only constructed here, it is not written anywhere yet.
+    pub fn to(target: VirtAddr) -> JmpPad {
+        let VirtAddr(destination) = target;
+        // Opcodes are stored as little-endian `u16` values.
+        let movabs = 0xb848; // REX.W + mov rax, imm64
+        let jmp_rax = 0xe0ff; // jmp rax
+
+        JmpPad {
+            movabs,
+            target: destination,
+            jmp_rax,
+        }
+    }
+
+    /// Write the jump pad to the virtual address `addr`, consuming the pad.
+    ///
+    /// # Safety
+    /// The caller must guarantee the following constraints:
+    /// - `addr` must be a valid virtual address referring to writeable memory.
+    /// - There must be enough space to store [`size_of::<JmpPad>()`](core::mem::size_of) bytes.
+    pub unsafe fn install_at(self, addr: VirtAddr) {
+        let destination = addr.0 as *mut JmpPad;
+        destination.write(self);
+    }
+}
diff --git a/src/main.rs b/src/main.rs
new file mode 100644
index 0000000..400b2d7
--- /dev/null
+++ b/src/main.rs
@@ -0,0 +1,144 @@
+use goblin::elf::{Elf, program_header::PT_LOAD};
+use std::convert::TryFrom;
+use vdso_proxy_poc::{Error, JmpPad, MapEntry, Mmap, VirtAddr};
+
+#[cfg(not(target_os = "linux"))]
+compile_error!("This only makes sense on Linux, as we are poking the vdso.");
+
+/// Look up the `[vdso]` segment of the current process in `/proc/self/maps`.
+///
+/// Lines are parsed in order until the segment is found. A line that fails to parse before
+/// that aborts the search with the parse error.
+fn get_vdso() -> Result<MapEntry, Error> {
+    let maps = std::fs::read_to_string("/proc/self/maps").map_err(|_| Error::FailedToReadMaps)?;
+
+    for line in maps.lines() {
+        let entry = MapEntry::from_line(line)?;
+        if entry.name.as_deref() == Some("[vdso]") {
+            return Ok(entry);
+        }
+    }
+
+    Err(Error::VdsoSegmentNotFound)
+}
+
+/// Duplicate the `vdso` memory segment into a freshly allocated `rwx` mapping, ie allocate
+/// memory and copy the virtual address range described by `vdso` into it.
+///
+/// Returns `None` if the allocation fails.
+///
+/// # Safety
+/// The caller must guarantee that the `vdso` argument describes a valid virtual address range by
+/// its `address` and `length` fields.
+#[allow(unused_unsafe)]
+unsafe fn copy_vdso(vdso: &MapEntry) -> Option<Mmap> {
+    let start = vdso.addr as *const u8;
+    let size = usize::try_from(vdso.len)
+        .expect("It's required that the segment length fits into a usize!");
+
+    // SAFETY: Validity of start & size must be ensured by the caller.
+    let segment: &[u8] = unsafe { std::slice::from_raw_parts(start, size) };
+
+    Mmap::new_rwx_from(segment)
+}
+
+/// Find the `symbol_name` in the vdso described by the [`MapEntry`] memory segment and return
+/// its absolute virtual address.
+///
+/// # Errors
+/// - [`Error::FailedToParseAsElf`] if the segment does not parse as an ELF file.
+/// - [`Error::LoadPhdrNotFound`] if the ELF file has no `PT_LOAD` program header.
+/// - [`Error::SymbolNotFound`] if no function symbol with the given name exists.
+///
+/// # Panics
+/// Panics if the segment length does not fit into a `usize` or if the computed dso base
+/// address is `0`.
+///
+/// # Safety
+/// The caller must guarantee that the `vdso` argument describes a valid virtual address range by
+/// its `address` and `length` fields.
+///
+/// # Note
+/// Currently the version of the symbol is not checked, technically this is an error which can be
+/// fatal in case of a binary incompatibility, but that's accepted for this PoC.
+#[allow(unused_unsafe)]
+unsafe fn get_vdso_sym(vdso: &MapEntry, symbol_name: &str) -> Result<VirtAddr, Error> {
+    // Turn `vdso` maps entry into slice of bytes.
+    let bytes = {
+        let ptr = vdso.addr as *const u8;
+        let len = usize::try_from(vdso.len)
+            .expect("It's required that the segment length fits into a usize!");
+        // SAFETY: Validity of ptr & len must be ensured by the caller.
+        unsafe { std::slice::from_raw_parts(ptr, len) }
+    };
+
+    // Parse vdso bytes as ELF.
+    let elf = Elf::parse(bytes).map_err(|_| Error::FailedToParseAsElf)?;
+
+    // Compute the dynamic shared object (dso) base address (load bias). Symbol values are
+    // relative to this dso base address.
+    //
+    // The segment starts at file offset 0, hence file offset `p_offset` lives at
+    // `vdso.addr + p_offset`, which is where the ELF expects `p_vaddr` to be:
+    //   base + p_vaddr == vdso.addr + p_offset
+    // The arithmetic is modular, same as the `uintptr_t` computation in the kernel's reference
+    // vdso parser.
+    let dso_base = {
+        let phdr_load = elf
+            .program_headers
+            .iter()
+            .find(|p| p.p_type == PT_LOAD)
+            .ok_or(Error::LoadPhdrNotFound)?;
+        vdso.addr
+            .wrapping_add(phdr_load.p_offset)
+            .wrapping_sub(phdr_load.p_vaddr)
+    };
+    assert_ne!(dso_base, 0, "If the dso base address is 0 that means the symbols contain absolute addresses, we don't want to support that!");
+
+    // Try to find the requested symbol among the dynamic function symbols.
+    let sym = elf
+        .dynsyms
+        .iter()
+        .filter(|sym| sym.is_function())
+        .find(|sym| matches!(elf.dynstrtab.get_at(sym.st_name), Some(name) if name == symbol_name))
+        .ok_or_else(|| Error::SymbolNotFound(symbol_name.into()))?;
+
+    // Compute the absolute virtual address of the requested symbol.
+    Ok(VirtAddr(dso_base.wrapping_add(sym.st_value)))
+}
+
+/// Represent the `struct timeval` C structure (see `man 2 gettimeofday`).
+///
+/// Declared `#[repr(C)]` because a pointer to it is handed to the `extern "C"`
+/// `__vdso_gettimeofday` function, which fills in both fields.
+#[repr(C)]
+struct Timeval {
+ /// Seconds part of the time value.
+ tv_sec: i64,
+ /// Microseconds part of the time value.
+ tv_usec: i64,
+}
+
+/// Demonstrate the vdso proxy mechanism inside a single process:
+/// 1. Locate the vdso mapped by the kernel (plays the role of the _new_ vdso).
+/// 2. Copy it into an `rwx` allocation (plays the role of the _old_ vdso from a checkpoint).
+/// 3. Patch `__vdso_gettimeofday` in the copy with a trampoline into the kernel-mapped vdso.
+/// 4. Call the function through the copy and print the result.
+///
+/// # Panics
+/// Panics if the vdso copy cannot be allocated or if the forwarded `gettimeofday` call reports
+/// a failure.
+fn main() -> Result<(), Error> {
+    // This represents the _new_ vdso pages that the kernel mapped into the restoring process.
+    let new_vdso = get_vdso()?;
+
+    // This represents the _old_ vdso pages that were captured in the memory dump of the process
+    // checkpoint.
+    //
+    // SAFETY: new_vdso describes a valid memory region as we got it from /proc/self/maps.
+    let old_vdso = unsafe { copy_vdso(&new_vdso) }.expect("Copy of vdso must succeed!");
+
+    // SAFETY: new_vdso describes a valid memory region as we got it from /proc/self/maps.
+    let new_sym_addr = unsafe { get_vdso_sym(&new_vdso, "__vdso_gettimeofday") }?;
+    // SAFETY: old_vdso describes a valid and owned memory allocation.
+    let old_sym_addr = unsafe { get_vdso_sym(old_vdso.as_ref(), "__vdso_gettimeofday") }?;
+
+    // As an example, install a trampoline for the `__vdso_gettimeofday` symbol. The trampoline is
+    // installed in the _old_ vdso pages, where the user code from the checkpoint image links to,
+    // and forwards the calls into the _new_ vdso pages.
+    let pad = JmpPad::to(new_sym_addr);
+    // SAFETY: old_sym_addr is a valid virtual address inside the writeable copy, as we got it
+    // from the symbol lookup.
+    unsafe { pad.install_at(old_sym_addr) };
+
+    let mut tv = Timeval {
+        tv_sec: 0,
+        tv_usec: 0,
+    };
+
+    // Mimic a call to `__vdso_gettimeofday` from user code which is still linked to the _old_
+    // vdso. The call enters the copied memory region (_old_ vdso) and should be forwarded to
+    // the function in the original memory region.
+    //
+    // SAFETY: old_sym_addr is a valid virtual address pointing to the (patched)
+    // `__vdso_gettimeofday` function in executable memory, and `tv` is valid for writes.
+    let ret = unsafe {
+        let gettimeofday: extern "C" fn(*mut Timeval, *mut libc::c_void) -> i32 =
+            std::mem::transmute(old_sym_addr.0 as *const ());
+        gettimeofday(&mut tv as *mut Timeval, std::ptr::null_mut())
+    };
+    // Don't print the zero-initialized `tv` as if it were a real result.
+    assert_eq!(ret, 0, "__vdso_gettimeofday reported a failure!");
+
+    println!("Timeval tv_sec : {} tv_usec : {}", tv.tv_sec, tv.tv_usec);
+
+    Ok(())
+}