1 files changed, 91 insertions, 156 deletions
diff --git a/sys/security/audit/audit_worker.c b/sys/security/audit/audit_worker.c
index 102e9f9..467c5b2 100644
--- a/sys/security/audit/audit_worker.c
+++ b/sys/security/audit/audit_worker.c
@@ -1,6 +1,6 @@
 /*
  * Copyright (c) 1999-2005 Apple Computer, Inc.
- * Copyright (c) 2006 Robert N. M. Watson
+ * Copyright (c) 2006-2008 Robert N. M. Watson
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
@@ -48,6 +48,7 @@
 #include <sys/socketvar.h>
 #include <sys/protosw.h>
 #include <sys/domain.h>
+#include <sys/sx.h>
 #include <sys/sysproto.h>
 #include <sys/sysent.h>
 #include <sys/systm.h>
@@ -75,31 +76,18 @@
 static struct proc		*audit_thread;
 
 /*
- * When an audit log is rotated, the actual rotation must be performed by the
- * audit worker thread, as it may have outstanding writes on the current
- * audit log.  audit_replacement_vp holds the vnode replacing the current
- * vnode.  We can't let more than one replacement occur at a time, so if more
- * than one thread requests a replacement, only one can have the replacement
- * "in progress" at any given moment.  If a thread tries to replace the audit
- * vnode and discovers a replacement is already in progress (i.e.,
- * audit_replacement_flag != 0), then it will sleep on audit_replacement_cv
- * waiting its turn to perform a replacement.  When a replacement is
- * completed, this cv is signalled by the worker thread so a waiting thread
- * can start another replacement.  We also store a credential to perform
- * audit log write operations with.
- *
- * The current credential and vnode are thread-local to audit_worker.
- */
-static struct cv		audit_replacement_cv;
-
-static int			audit_replacement_flag;
-static struct vnode		*audit_replacement_vp;
-static struct ucred		*audit_replacement_cred;
-
-/*
- * Flags related to Kernel->user-space communication.
+ * audit_cred and audit_vp are the stored credential and vnode to use for the
+ * active audit trail.  They are protected by audit_worker_sx, which will be
+ * held across all I/O and all rotation to prevent them from being replaced
+ * (rotated) while in use.  The audit_file_rotate_wait flag is set when the
+ * kernel has delivered a trigger to auditd to rotate the trail, and is
+ * cleared when the next rotation takes place.  It is also protected by
+ * audit_worker_sx.
  */
-static int			audit_file_rotate_wait;
+static int		 audit_file_rotate_wait;
+static struct sx	 audit_worker_sx;
+static struct ucred	*audit_cred;
+static struct vnode	*audit_vp;
 
 /*
  * Write an audit record to a file, performed as the last stage after both
@@ -110,8 +98,8 @@ static int			audit_file_rotate_wait;
  * the audit daemon, since the message is asynchronous anyway.
  */
 static void
-audit_record_write(struct vnode *vp, struct ucred *cred, struct thread *td,
-    void *data, size_t len)
+audit_record_write(struct vnode *vp, struct ucred *cred, void *data,
+    size_t len)
 {
 	static struct timeval last_lowspace_trigger;
 	static struct timeval last_fail;
@@ -122,6 +110,8 @@ audit_record_write(struct vnode *vp, struct ucred *cred, struct thread *td,
 	struct vattr vattr;
 	long temp;
 
+	sx_assert(&audit_worker_sx, SA_LOCKED);	/* audit_file_rotate_wait. */
+
 	if (vp == NULL)
 		return;
 
@@ -133,11 +123,11 @@ audit_record_write(struct vnode *vp, struct ucred *cred, struct thread *td,
 	 * that we know how we're doing on space.  Consider failure of these
 	 * operations to indicate a future inability to write to the file.
 	 */
-	error = VFS_STATFS(vp->v_mount, mnt_stat, td);
+	error = VFS_STATFS(vp->v_mount, mnt_stat, curthread);
 	if (error)
 		goto fail;
 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
-	error = VOP_GETATTR(vp, &vattr, cred, td);
+	error = VOP_GETATTR(vp, &vattr, cred, curthread);
 	VOP_UNLOCK(vp, 0);
 	if (error)
 		goto fail;
@@ -200,6 +190,8 @@ audit_record_write(struct vnode *vp, struct ucred *cred, struct thread *td,
 	 */
 	if ((audit_fstat.af_filesz != 0) && (audit_file_rotate_wait == 0) &&
 	    (vattr.va_size >= audit_fstat.af_filesz)) {
+		sx_assert(&audit_worker_sx, SA_XLOCKED);
+
 		audit_file_rotate_wait = 1;
 		(void)send_trigger(AUDIT_TRIGGER_ROTATE_KERNEL);
 	}
@@ -234,7 +226,7 @@ audit_record_write(struct vnode *vp, struct ucred *cred, struct thread *td,
 	}
 
 	error = vn_rdwr(UIO_WRITE, vp, data, len, (off_t)0, UIO_SYSSPACE,
-	    IO_APPEND|IO_UNIT, cred, NULL, NULL, td);
+	    IO_APPEND|IO_UNIT, cred, NULL, NULL, curthread);
 	if (error == ENOSPC)
 		goto fail_enospc;
 	else if (error)
@@ -252,7 +244,7 @@ audit_record_write(struct vnode *vp, struct ucred *cred, struct thread *td,
 	if (audit_in_failure) {
 		if (audit_q_len == 0 && audit_pre_q_len == 0) {
 			VOP_LOCK(vp, LK_DRAIN | LK_INTERLOCK);
-			(void)VOP_FSYNC(vp, MNT_WAIT, td);
+			(void)VOP_FSYNC(vp, MNT_WAIT, curthread);
 			VOP_UNLOCK(vp, 0);
 			panic("Audit store overflow; record queue drained.");
 		}
@@ -269,7 +261,7 @@ fail_enospc:
 	 */
 	if (audit_fail_stop) {
 		VOP_LOCK(vp, LK_DRAIN | LK_INTERLOCK);
-		(void)VOP_FSYNC(vp, MNT_WAIT, td);
+		(void)VOP_FSYNC(vp, MNT_WAIT, curthread);
 		VOP_UNLOCK(vp, 0);
 		panic("Audit log space exhausted and fail-stop set.");
 	}
@@ -284,7 +276,7 @@ fail:
 	 */
 	if (audit_panic_on_write_fail) {
 		VOP_LOCK(vp, LK_DRAIN | LK_INTERLOCK);
-		(void)VOP_FSYNC(vp, MNT_WAIT, td);
+		(void)VOP_FSYNC(vp, MNT_WAIT, curthread);
 		VOP_UNLOCK(vp, 0);
 		panic("audit_worker: write error %d\n", error);
 	} else if (ppsratecheck(&last_fail, &cur_fail, 1))
@@ -293,62 +285,6 @@ fail:
 }
 
 /*
- * If an appropriate signal has been received rotate the audit log based on
- * the global replacement variables.  Signal consumers as needed that the
- * rotation has taken place.
- *
- * The global variables and CVs used to signal the audit_worker to perform a
- * rotation are essentially a message queue of depth 1.  It would be much
- * nicer to actually use a message queue.
- */
-static void
-audit_worker_rotate(struct ucred **audit_credp, struct vnode **audit_vpp,
-    struct thread *audit_td)
-{
-	int do_replacement_signal, vfslocked;
-	struct ucred *old_cred;
-	struct vnode *old_vp;
-
-	mtx_assert(&audit_mtx, MA_OWNED);
-
-	do_replacement_signal = 0;
-	while (audit_replacement_flag != 0) {
-		old_cred = *audit_credp;
-		old_vp = *audit_vpp;
-		*audit_credp = audit_replacement_cred;
-		*audit_vpp = audit_replacement_vp;
-		audit_replacement_cred = NULL;
-		audit_replacement_vp = NULL;
-		audit_replacement_flag = 0;
-
-		audit_enabled = (*audit_vpp != NULL);
-
-		if (old_vp != NULL) {
-			mtx_unlock(&audit_mtx);
-			vfslocked = VFS_LOCK_GIANT(old_vp->v_mount);
-			vn_close(old_vp, AUDIT_CLOSE_FLAGS, old_cred,
-			    audit_td);
-			VFS_UNLOCK_GIANT(vfslocked);
-			crfree(old_cred);
-			mtx_lock(&audit_mtx);
-			old_cred = NULL;
-			old_vp = NULL;
-		}
-		do_replacement_signal = 1;
-	}
-
-	/*
-	 * Signal that replacement have occurred to wake up and start any
-	 * other replacements started in parallel.  We can continue about our
-	 * business in the mean time.  We broadcast so that both new
-	 * replacements can be inserted, but also so that the source(s) of
-	 * replacement can return successfully.
-	 */
-	if (do_replacement_signal)
-		cv_broadcast(&audit_replacement_cv);
-}
-
-/*
  * Given a kernel audit record, process as required.  Kernel audit records
  * are converted to one, or possibly two, BSM records, depending on whether
  * there is a user audit record present also.  Kernel records need be
@@ -356,23 +292,38 @@ audit_worker_rotate(struct ucred **audit_credp, struct vnode **audit_vpp,
  * written to disk, and audit pipes.
  */
 static void
-audit_worker_process_record(struct vnode *audit_vp, struct ucred *audit_cred,
-    struct thread *audit_td, struct kaudit_record *ar)
+audit_worker_process_record(struct kaudit_record *ar)
 {
 	struct au_record *bsm;
 	au_class_t class;
 	au_event_t event;
 	au_id_t auid;
 	int error, sorf;
+	int trail_locked;
+
+	/*
+	 * We hold the audit_worker_sx lock over both writes, if there are
+	 * two, so that the two records won't be split across a rotation and
+	 * end up in two different trail files.
+	 */
+	if (((ar->k_ar_commit & AR_COMMIT_USER) &&
+	    (ar->k_ar_commit & AR_PRESELECT_USER_TRAIL)) ||
+	    (ar->k_ar_commit & AR_PRESELECT_TRAIL)) {
+		sx_xlock(&audit_worker_sx);
+		trail_locked = 1;
+	} else
+		trail_locked = 0;
 
 	/*
 	 * First, handle the user record, if any: commit to the system trail
 	 * and audit pipes as selected.
 	 */
 	if ((ar->k_ar_commit & AR_COMMIT_USER) &&
-	    (ar->k_ar_commit & AR_PRESELECT_USER_TRAIL))
-		audit_record_write(audit_vp, audit_cred, audit_td,
-		    ar->k_udata, ar->k_ulen);
+	    (ar->k_ar_commit & AR_PRESELECT_USER_TRAIL)) {
+		sx_assert(&audit_worker_sx, SA_XLOCKED);
+		audit_record_write(audit_vp, audit_cred, ar->k_udata,
+		    ar->k_ulen);
+	}
 
 	if ((ar->k_ar_commit & AR_COMMIT_USER) &&
 	    (ar->k_ar_commit & AR_PRESELECT_USER_PIPE))
@@ -381,7 +332,7 @@ audit_worker_process_record(struct vnode *audit_vp, struct ucred *audit_cred,
 	if (!(ar->k_ar_commit & AR_COMMIT_KERNEL) ||
 	    ((ar->k_ar_commit & AR_PRESELECT_PIPE) == 0 &&
 	    (ar->k_ar_commit & AR_PRESELECT_TRAIL) == 0))
-		return;
+		goto out;
 
 	auid = ar->k_ar.ar_subj_auid;
 	event = ar->k_ar.ar_event;
@@ -394,11 +345,11 @@ audit_worker_process_record(struct vnode *audit_vp, struct ucred *audit_cred,
 	error = kaudit_to_bsm(ar, &bsm);
 	switch (error) {
 	case BSM_NOAUDIT:
-		return;
+		goto out;
 
 	case BSM_FAILURE:
 		printf("audit_worker_process_record: BSM_FAILURE\n");
-		return;
+		goto out;
 
 	case BSM_SUCCESS:
 		break;
@@ -407,9 +358,10 @@ audit_worker_process_record(struct vnode *audit_vp, struct ucred *audit_cred,
 		panic("kaudit_to_bsm returned %d", error);
 	}
 
-	if (ar->k_ar_commit & AR_PRESELECT_TRAIL)
-		audit_record_write(audit_vp, audit_cred, audit_td, bsm->data,
-		    bsm->len);
+	if (ar->k_ar_commit & AR_PRESELECT_TRAIL) {
+		sx_assert(&audit_worker_sx, SA_XLOCKED);
+		audit_record_write(audit_vp, audit_cred, bsm->data, bsm->len);
+	}
 
 	if (ar->k_ar_commit & AR_PRESELECT_PIPE)
 		audit_pipe_submit(auid, event, class, sorf,
@@ -417,50 +369,39 @@ audit_worker_process_record(struct vnode *audit_vp, struct ucred *audit_cred,
 		    bsm->len);
 
 	kau_free(bsm);
+out:
+	if (trail_locked)
+		sx_xunlock(&audit_worker_sx);
 }
 
 /*
  * The audit_worker thread is responsible for watching the event queue,
  * dequeueing records, converting them to BSM format, and committing them to
  * disk.  In order to minimize lock thrashing, records are dequeued in sets
- * to a thread-local work queue.  In addition, the audit_work performs the
- * actual exchange of audit log vnode pointer, as audit_vp is a thread-local
- * variable.
+ * to a thread-local work queue.
+ *
+ * Note: this means that the effective bound on the size of the pending record
+ * queue is 2x the length of the global queue.
  */
 static void
 audit_worker(void *arg)
 {
 	struct kaudit_queue ar_worklist;
 	struct kaudit_record *ar;
-	struct ucred *audit_cred;
-	struct thread *audit_td;
-	struct vnode *audit_vp;
 	int lowater_signal;
 
-	/*
-	 * These are thread-local variables requiring no synchronization.
-	 */
 	TAILQ_INIT(&ar_worklist);
-	audit_cred = NULL;
-	audit_td = curthread;
-	audit_vp = NULL;
-
 	mtx_lock(&audit_mtx);
 	while (1) {
 		mtx_assert(&audit_mtx, MA_OWNED);
 
 		/*
-		 * Wait for record or rotation events.
+		 * Wait for a record.
 		 */
-		while (!audit_replacement_flag && TAILQ_EMPTY(&audit_q))
+		while (TAILQ_EMPTY(&audit_q))
 			cv_wait(&audit_worker_cv, &audit_mtx);
 
 		/*
-		 * First priority: replace the audit log target if requested.
-		 */
-		audit_worker_rotate(&audit_cred, &audit_vp, audit_td);
-
-		/*
 		 * If there are records in the global audit record queue,
 		 * transfer them to a thread-local queue and process them
 		 * one by one.  If we cross the low watermark threshold,
@@ -481,8 +422,7 @@ audit_worker(void *arg)
 		mtx_unlock(&audit_mtx);
 		while ((ar = TAILQ_FIRST(&ar_worklist))) {
 			TAILQ_REMOVE(&ar_worklist, ar, k_q);
-			audit_worker_process_record(audit_vp, audit_cred,
-			    audit_td, ar);
+			audit_worker_process_record(ar);
 			audit_free(ar);
 		}
 		mtx_lock(&audit_mtx);
@@ -492,50 +432,45 @@ audit_worker(void *arg)
 /*
  * audit_rotate_vnode() is called by a user or kernel thread to configure or
  * de-configure auditing on a vnode.  The arguments are the replacement
- * credential and vnode to substitute for the current credential and vnode,
- * if any.  If either is set to NULL, both should be NULL, and this is used
- * to indicate that audit is being disabled.  The real work is done in the
- * audit_worker thread, but audit_rotate_vnode() waits synchronously for that
- * to complete.
- *
- * The vnode should be referenced and opened by the caller.  The credential
- * should be referenced.  audit_rotate_vnode() will own both references as of
- * this call, so the caller should not release either.
- *
- * XXXAUDIT: Review synchronize communication logic.  Really, this is a
- * message queue of depth 1.  We are essentially acquiring ownership of the
- * communications queue, inserting our message, and waiting for an
- * acknowledgement.
+ * credential (referenced) and vnode (referenced and opened) to substitute
+ * for the current credential and vnode, if any.  If either is set to NULL,
+ * both should be NULL, and this is used to indicate that audit is being
+ * disabled.  Any previous cred/vnode will be closed and freed.  We re-enable
+ * generating rotation requests to auditd.
  */
 void
 audit_rotate_vnode(struct ucred *cred, struct vnode *vp)
 {
+	struct ucred *old_audit_cred;
+	struct vnode *old_audit_vp;
+	int vfslocked;
 
-	/*
-	 * If other parallel log replacements have been requested, we wait
-	 * until they've finished before continuing.
-	 */
-	mtx_lock(&audit_mtx);
-	while (audit_replacement_flag != 0)
-		cv_wait(&audit_replacement_cv, &audit_mtx);
-	audit_replacement_cred = cred;
-	audit_replacement_flag = 1;
-	audit_replacement_vp = vp;
+	KASSERT((cred != NULL && vp != NULL) || (cred == NULL && vp == NULL),
+	    ("audit_rotate_vnode: cred %p vp %p", cred, vp));
 
 	/*
-	 * Wake up the audit worker to perform the exchange once we release
-	 * the mutex.
+	 * Rotate the vnode/cred, and clear the rotate flag so that we will
+	 * send a rotate trigger if the new file fills.
 	 */
-	cv_signal(&audit_worker_cv);
+	sx_xlock(&audit_worker_sx);
+	old_audit_cred = audit_cred;
+	old_audit_vp = audit_vp;
+	audit_cred = cred;
+	audit_vp = vp;
+	audit_file_rotate_wait = 0;
+	audit_enabled = (audit_vp != NULL);
+	sx_xunlock(&audit_worker_sx);
 
 	/*
-	 * Wait for the audit_worker to broadcast that a replacement has
-	 * taken place; we know that once this has happened, our vnode has
-	 * been replaced in, so we can return successfully.
+	 * If there was an old vnode/credential, close and free.
 	 */
-	cv_wait(&audit_replacement_cv, &audit_mtx);
-	audit_file_rotate_wait = 0; /* We can now request another rotation */
-	mtx_unlock(&audit_mtx);
+	if (old_audit_vp != NULL) {
+		vfslocked = VFS_LOCK_GIANT(old_audit_vp->v_mount);
+		vn_close(old_audit_vp, AUDIT_CLOSE_FLAGS, old_audit_cred,
+		    curthread);
+		VFS_UNLOCK_GIANT(vfslocked);
+		crfree(old_audit_cred);
+	}
 }
 
 void
@@ -543,7 +478,7 @@ audit_worker_init(void)
 {
 	int error;
 
-	cv_init(&audit_replacement_cv, "audit_replacement_cv");
+	sx_init(&audit_worker_sx, "audit_worker_sx");
 	error = kproc_create(audit_worker, NULL, &audit_thread, RFHIGHPID,
 	    0, "audit");
 	if (error)