From 42489752b4c78f6bbc80bb56a4347b692a067c29 Mon Sep 17 00:00:00 2001
From: Sean Whitton <spwhitton@spwhitton.name>
Date: Sun, 25 Jul 2021 13:03:57 -0700
Subject: add Linux namespace-entering connections

Signed-off-by: Sean Whitton <spwhitton@spwhitton.name>
---
 src/connection/fork.lisp            |  49 +++++++
 src/connection/linux-namespace.lisp | 253 ++++++++++++++++++++++++++++++++++++
 2 files changed, 302 insertions(+)
 create mode 100644 src/connection/linux-namespace.lisp

(limited to 'src/connection')

diff --git a/src/connection/fork.lisp b/src/connection/fork.lisp
index 16c9328..aa1626e 100644
--- a/src/connection/fork.lisp
+++ b/src/connection/fork.lisp
@@ -35,3 +35,52 @@ Must not start up any threads."))
      exit
      :on-failure (failed-change
                   "~&Fork connection child failed; stderr was ~%~%~A" err))))
+
+
+;;;; Dumping and then immediately reinvoking Lisp
+
+(defclass init-hooks-connection (fork-connection) ()
+  (:documentation "On SBCL, call POST-FORK using SB-EXT:*INIT-HOOKS*.
+
+The primary purpose of this connection type is to obtain a truly
+single-threaded context for the execution of POST-FORK."))
+
+#+(and sbcl sb-thread)
+(eval-when (:compile-toplevel :load-toplevel :execute)
+  ;; UIOP:VERSION< cannot handle Debian-patched SBCL version numbers, so we
+  ;; split it up ourselves.
+  (destructuring-bind (major minor patch . rest)
+      (mapcar (lambda (s) (parse-integer s :junk-allowed t))
+              (split-string (lisp-implementation-version) :separator '(#\.)))
+    (declare (ignore rest))
+    (unless (or (> major 2)
+                (and (= major 2)
+                     (or (> minor 1) (and (= minor 1) (> patch 7)))))
+      (pushnew 'older-sbcl *features*))))
+
+#+sbcl
+(defmethod continue-connection ((connection init-hooks-connection) remaining)
+  (eval-in-reinvoked
+      `(push
+        (lambda ()
+          (handler-bind
+              ((serious-condition
+                 (lambda (c)
+                   (trivial-backtrace:print-backtrace c :output *error-output*)
+                   (uiop:quit 3))))
+            ;; Handle the finaliser thread in older SBCL, before the change in
+            ;; 2.1.8 to call *INIT-HOOKS* before starting system threads.
+            #+consfigurator.connection.fork::older-sbcl
+            (sb-int:with-system-mutex (sb-thread::*make-thread-lock*)
+              (sb-impl::finalizer-thread-stop))
+            (post-fork ,connection)
+            #+consfigurator.connection.fork::older-sbcl
+            (sb-impl::finalizer-thread-start)))
+        sb-ext:*init-hooks*)
+      `(continue-deploy* ,connection ',remaining) (out err exit)
+    (when-let ((lines (lines out)))
+      (inform t lines))
+    (return-exit
+     exit
+     :on-failure (failed-change
+                  "~&Reinvoked Lisp image failed; stderr was ~%~%~A" err))))
diff --git a/src/connection/linux-namespace.lisp b/src/connection/linux-namespace.lisp
new file mode 100644
index 0000000..89b5daf
--- /dev/null
+++ b/src/connection/linux-namespace.lisp
@@ -0,0 +1,253 @@
+;;; Consfigurator -- Lisp declarative configuration management system
+
+;;; Copyright (C) 2021  Sean Whitton <spwhitton@spwhitton.name>
+
+;;; This file is free software; you can redistribute it and/or modify
+;;; it under the terms of the GNU General Public License as published by
+;;; the Free Software Foundation; either version 3, or (at your option)
+;;; any later version.
+
+;;; This file is distributed in the hope that it will be useful,
+;;; but WITHOUT ANY WARRANTY; without even the implied warranty of
+;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+;;; GNU General Public License for more details.
+
+;;; You should have received a copy of the GNU General Public License
+;;; along with this program.  If not, see <http://www.gnu.org/licenses/>.
+
+(in-package :consfigurator.connection.linux-namespace)
+(named-readtables:in-readtable :consfigurator)
+
+;;; 'SETNS is SBCL-specific due to handling of non-user threads like the
+;;; finaliser thread: we must be truly single-threaded in order to enter a
+;;; different user namespace.  If we can't use it then we fall back to a
+;;; POSIX-type connection into the container.  In the latter situation the
+;;; user could follow the :SYSTEMD-MACHINED connection with a connection type
+;;; which starts up a remote Lisp image within the container.  This will be
+;;; significantly slower, so if there is more than one container it will
+;;; probably be worth arranging for the use of 'SETNS.
+;;;
+;;; An alternative to calling setns(2) ourselves, in the Lisp-type connection
+;;; case, might be to dump an image, write it to a temporary file within the
+;;; container using WITH-REMOTE-TEMPORARY-FILE & nsenter(1), and then reinvoke
+;;; the image, again using nsenter(1).  This would be more portable with
+;;; respect to Lisp implementations.  However, it would run into the problems
+;;; described in "Dumping and reinvoking Lisp" in pitfalls.rst.  With the
+;;; current approach, shared library dependencies need be available only
+;;; outside the container, and then after entering the container they will
+;;; still be usable.
+
+(defmethod establish-connection
+    ((type (eql :systemd-machined)) remaining &key name uid gid)
+  (establish-connection
+   #+sbcl (if (lisp-connection-p) 'setns :nsenter) #-sbcl :nsenter remaining
+   :pid (or (loop for line in (runlines "machinectl" "show" name)
+                  when (string-prefix-p "Leader=" line)
+                    return (subseq line 7))
+            (error "Could not determine PID for machine ~A." name))
+   :uid uid :gid gid))
+
+
+;;;; :NSENTER
+
+(defmethod establish-connection
+    ((type (eql :nsenter)) remaining &key pid uid gid)
+  (declare (ignore remaining))
+  (informat 1 "~&Entering namespaces of PID ~D with nsenter(1)" pid)
+  (make-instance 'nsenter-connection :pid pid :uid uid :gid gid))
+
+(defclass linux-namespace-connection ()
+  ((pid :type integer :initarg :pid
+        :initform (simple-program-error "Must supply namespace leader PID.")
+        :documentation
+        "A PID of a process which is already within all of the namespaces.")
+   (env :type list :initform nil)
+   (uid :initarg :uid :initform nil)
+   (gid :initarg :gid :initform nil))
+  (:documentation
+   "A connection which works by reassociating to a set of Linux namespaces."))
+
+(defmethod initialize-instance :after
+    ((connection linux-namespace-connection) &key)
+  (with-slots (pid env) connection
+    ;; In the case that PID is in another user namespace, we won't be able to
+    ;; read the environment unless we're either root or we first enter the
+    ;; other user namespace.
+    (let ((cmd
+#?"cat /proc/${pid}/environ || nsenter -U -t ${pid} cat /proc/${pid}/environ"))
+      (dolist (entry (split-string (run cmd) :separator '(#.(code-char 0))))
+        (when (plusp (length entry))
+          (push entry env))))))
+
+(defclass nsenter-connection
+    (linux-namespace-connection shell-wrap-connection) ())
+
+(defmethod connection-shell-wrap ((connection nsenter-connection) cmd)
+  (with-slots (pid uid gid env) connection
+    (format
+     nil
+     "nsenter ~@[-S ~D ~]~@[-G ~D ~]-at ~D env -i ~{~A~^ ~} sh -c ~A"
+     uid gid pid (mapcar #'escape-sh-token env) (escape-sh-token cmd))))
+
+
+;;;; 'SETNS
+
+;;; Whether to setuid/setgid: setting aside --preserve-credentials, which we
+;;; don't support, nsenter(1) will setuid (resp. setgid) when a UID
+;;; (resp. GID) is supplied on the command line, or when asked to enter a new
+;;; user namespace, either with -U or -t.  In the latter case both default to
+;;; 0.  So that we can abstract over 'SETNS and :NSENTER using a connection
+;;; type like :SYSTEMD-MACHINED, we mirror these semantics in our 'SETNS.
+#+sbcl
+(defmethod establish-connection
+    ((type (eql 'setns)) remaining &key pid uid gid)
+  "Use setns(2) to enter the Linux namespaces of process PID.  Additionally,
+
+- If PID has a distinct user namespace and we have permission to setgroups(2)
+  in the initial user namespace, then before entering the target userns,
+
+  - if the target userns is owned by root, clear supplementary groups
+
+  - if the target userns is owned by nonroot, call initgroups(3) to assume the
+    supplementary groups of the owner.
+
+- After entering the target namespaces:
+
+  - If UID, or PID has a distinct userns, attempt to setuid(2) to UID, in the
+    latter case defaulting UID to 0.  Also change to UID's home directory, and
+    update HOME, PATH, USER and LOGNAME environment variables.
+
+  - If GID, or PID has a distinct userns, attempt to setgid(2) to GID, in the
+    latter case defaulting GID to 0.  Also, if setgroups(2) is permitted
+    within the target user namespace,
+
+    - if we also called setuid(2) then call initgroups(3) to assume the
+      supplementary groups belonging to UID
+
+    - if we called only setgid(2), clear supplementary groups.
+
+Thus, if PID has a distinct userns then the userns's uid_map and gid_map must
+already have been written, and must include mappings for UID and GID, which
+default to 0 and 0.  It is not an error if we do not have the ability to
+setgroups(2) in either the starting user namespace or the target user
+namespace, in each case either due to a lack of privilege or because
+setgroups(2) is denied in the namespace."
+  (informat 1 "~&Reassociating to Linux namespaces of PID ~D" pid)
+  (alet (make-instance 'nsenter-connection :pid pid :uid uid :gid gid)
+    (upload-all-prerequisite-data it)
+    (change-class it 'setns-connection)
+    (continue-connection it remaining)))
+
+(defclass setns-connection
+    (linux-namespace-connection init-hooks-connection) ())
+
+(define-constant +namespace-types+ `(("user"   . ,+CLONE_NEWUSER+)
+                                     ("cgroup" . ,+CLONE_NEWCGROUP+)
+                                     ("ipc"    . ,+CLONE_NEWIPC+)
+                                     ("uts"    . ,+CLONE_NEWUTS+)
+                                     ("net"    . ,+CLONE_NEWNET+)
+                                     ("pid"    . ,+CLONE_NEWPID+)
+                                     ("mnt"    . ,+CLONE_NEWNS+)
+                                     ("time"   . ,+CLONE_NEWTIME+))
+  :test #'equal)
+
+(define-error-retval-cfun () "setns" :int (fd :int) (type :int))
+
+#+sbcl
+(defmethod post-fork ((connection setns-connection))
+  (with-slots (pid uid gid env) connection
+    (let* (user opened-fds
+           ;; Check whether the target user namespace is the current user
+           ;; namespace because it is never permitted to reenter the current
+           ;; user namespace using setns(2).
+           (us (nix:stat #?"/proc/${(nix:getpid)}/ns/user"))
+           (them (nix:stat #?"/proc/${pid}/ns/user"))
+           (setuserns (not (and (= (nix:stat-dev us) (nix:stat-dev them))
+                                (= (nix:stat-ino us) (nix:stat-ino them)))))
+           (uid (or uid (and setuserns 0)))
+           (gid (or gid (and setuserns 0))))
+      (unwind-protect
+           (flet ((sysopen (path)
+                    (aprog1 (nix:open path nix:O-RDONLY)
+                      (push it opened-fds))))
+             (let ((ns-fds
+                     (loop for (name . type) in (if setuserns
+                                                    +namespace-types+
+                                                    (cdr +namespace-types+))
+                           collect
+                           (cons (sysopen #?"/proc/${pid}/ns/${name}") type)))
+                   (root-fd (sysopen #?"/proc/${pid}/root")))
+               (when (and setuserns (setgroups-p))
+                 (let ((owner (get-userns-owner (caar ns-fds))))
+                   (if (zerop owner)
+                       (nix:setgroups nil)
+                       (alet (osicat:user-info owner)
+                         ;; As a precaution, we could also setuid & setgid to
+                         ;; OWNER here.  However, it ought to be meaningless
+                         ;; to do so, because a process loses all capabilities
+                         ;; in the parent or previous user namespace.
+                         (nix:initgroups (cdr (assoc :name it))
+                                         (cdr (assoc :group-id it)))))))
+               ;; Reset to a standard SHELL and PATH & clear out rest of env.
+               (posix-login-environment)
+               ;; It might be that we need to enter the userns in order to
+               ;; have the capability to enter the other namespaces, or it
+               ;; might be that we want to enter the userns in order to reduce
+               ;; our privilege.  So that we can handle both of these, try
+               ;; entering the other namespaces both before and after entering
+               ;; the new user namespace; the first set of attempts will fail
+               ;; silently in the former case.
+               ;;
+               ;; This technique is based on part of nsenter(1) from
+               ;; util-linux, but additionally, in the first pass we accept
+               ;; errors only of type EPERM, and see below about EPERM in the
+               ;; second pass.
+               (when setuserns
+                 (loop for cell on (cdr ns-fds)
+                       do (handler-case (setns (caar cell) (cdar cell))
+                            (nix:eperm ())
+                            (:no-error (&rest ignore)
+                              (declare (ignore ignore))
+                              (rplaca (car cell) nil)))))
+               (loop for (fd . type) in ns-fds
+                     ;; Accept failures due to insufficient capabilities which
+                     ;; occur after we've entered the new userns, as that
+                     ;; indicates that the namespace we tried to join belongs
+                     ;; to a parent userns, in which case if we were ever
+                     ;; going to join it would have to have been on 1st pass.
+                     if (and fd setuserns (not (eql type +CLONE_NEWUSER+)))
+                       do (handler-case (setns fd type) (nix:eperm ()))
+                     else if fd do (setns fd type))
+               ;; If we entered new PID or time namespaces then need to fork
+               ;; so we're actually within them; for simplicity, always fork.
+               (mapc-open-output-streams
+                #'force-output
+                *standard-output* *error-output* *debug-io* *terminal-io*)
+               (let ((child (nix:fork)))
+                 (when (plusp child)
+                   (let ((status (nth-value 1 (nix:waitpid child))))
+                     (if (nix:WIFEXITED status)
+                         (uiop:quit (nix:WEXITSTATUS status))
+                         (error
+                          "PID namespace child did not exit normally.")))))
+               ;; If the namespace leader is chrooted then we want to be too.
+               (nix:fchdir root-fd) (chroot ".")))
+        (mapc #'nix:close opened-fds))
+      (when uid
+        (alet (or (osicat:user-info uid)
+                  (error "~&Could not look up user info for UID ~A." uid))
+          (setf user (cdr (assoc :name it)))
+          (posix-login-environment uid user (cdr (assoc :home it)))))
+      (dolist (entry env)
+        (let* ((pos (position #\= entry))
+               (var (subseq entry 0 pos))
+               (val (subseq entry (1+ pos))))
+          (unless
+              (and uid
+                   (memstring= var '("HOME" "SHELL" "USER" "LOGNAME" "PATH")))
+            (setf (getenv var) val))))
+      (when gid
+        (nix:setgid gid)
+        (when (setgroups-p)
+          (if uid (nix:initgroups user gid) (nix:setgroups nil))))
+      (when uid (nix:setuid uid)))))
-- 
cgit v1.2.3