PERFORCE change 80064 for review

Wed Jul 13 11:03:33 GMT 2005

http://perforce.freebsd.org/chv.cgi?CH=80064

Change 80064 by rwatson at rwatson_paprika on 2005/07/13 11:03:29

	       A number of annotations for consideration in future cleanup:
	
	       - Might be good to use UMA for per-thread and per-record storage,
	         rather than malloc.
	
	       - Active credential and vnode are thread-local variables.
	
	       - Annotate some error choices.
	
	       - Minor style tweaks.
	
	       - Annotate odd, fixable, improper, or otherwise interesting use of
	         error handling.  FreeBSD does not permit M_WAITOK to fail for some    
	         memory types, and that lack of that failure can clean up quite a
	         bit of logic here.
	
	       - Annotate possible spots to remove Giant due to MPSAFE VFS. 
	         Annotate lack of locking in the netinet interactions.
	
	       - Comment about calculation of free space.
	
	       - Summary comments for audit_worker(), audit_init(),
	         audit_rotate_vnode(), currecord(), audit_syscall_enter(),
	         audit_syscall_exit(), as well as potential issues with them.
	
	       - Comments about places to possibly add assertions.

Affected files ...

.. //depot/projects/trustedbsd/audit3/sys/security/audit/kern_audit.c#26 edit

Differences ...

==== //depot/projects/trustedbsd/audit3/sys/security/audit/kern_audit.c#26 (text+ko) ====

@@ -55,6 +55,10 @@
 #include <bsm/audit_kernel.h>
 #include <security/audit/audit_klib.h>
 
+/*
+ * XXXAUDIT: Might be nice to use a UMA zone for records, and malloc(9) only
+ * for things like triggers, temporary storage, etc.
+ */
 MALLOC_DEFINE(M_AUDIT, "audit", "Audit event records");
 
 #ifdef AUDIT
@@ -134,6 +138,8 @@
  * replacement.  When a replacement is completed, this cv is signalled
  * by the worker thread so a waiting thread can start another replacement.
  * We also store a credential to perform audit log write operations with.
+ *
+ * The current credential and vnode are thread-local to audit_worker.
  */
 static struct cv			audit_replacement_cv;
 
@@ -211,6 +217,9 @@
 		error = 0;
 		audit_isopen = 1;
 	} else {
+		/*
+		 * XXXRW: Why not EBUSY?
+		 */
 		error = EOPNOTSUPP;
 	}
 	mtx_unlock(&audit_mtx);
@@ -263,10 +272,15 @@
 static int
 audit_write(struct cdev *dev, struct uio *uio, int ioflag)
 {
+
 	/* Communication is kernel->userspace only */
 	return EOPNOTSUPP;
 }
 
+/*
+ * XXXAUDIT: Since we can't fail this call, we should not have a return
+ * value.
+ */
 static int
 send_trigger(unsigned int trigger)
 {
@@ -276,6 +290,9 @@
 	if (!audit_isopen)
 		return 0;
 
+	/*
+	 * XXXAUDIT: Use a condition variable instead of msleep/wakeup?
+	 */
 	ti = malloc(sizeof *ti, M_AUDIT, M_WAITOK);
 	mtx_lock(&audit_mtx);
 	ti->trigger = trigger;
@@ -294,9 +311,13 @@
 	.d_name = 	"audit"
 };
 
+/*
+ * XXXAUDIT: For consistency, perhaps audit_record_free()?
+ */
 static void
 audit_free(struct kaudit_record *ar)
 {
+
 	if (ar->k_ar.ar_arg_upath1 != NULL) {
 		free(ar->k_ar.ar_arg_upath1, M_AUDIT);
 	}
@@ -318,6 +339,13 @@
 	free(ar, M_AUDIT);
 }
 
+/*
+ * XXXAUDIT: Should adjust comments below to make it clear that we get to
+ * this point only if we believe we have storage, so not having space here
+ * is a violation of invariants derived from administrative procedures.
+ * I.e., someone else has written to the audit partition, leaving less space
+ * than we accounted for.
+ */
 static int
 audit_record_write(struct vnode *vp, struct kaudit_record *ar, 
     struct ucred *cred, struct thread *td)
@@ -328,6 +356,9 @@
 	struct vattr vattr;
 	struct statfs *mnt_stat = &vp->v_mount->mnt_stat;
 
+	/*
+	 * XXXAUDIT: In the world of MPSAFE VFS, this may not be necessary.
+	 */
 	mtx_assert(&Giant, MA_OWNED);
 
 	/*
@@ -385,6 +416,8 @@
 		/* 
 		 * Send a message to the audit daemon that disk space 
 		 * is getting low.
+		 *
+		 * XXXAUDIT: Check math and block size calculation here.
 		 */
 		if (audit_qctrl.aq_minfree != 0) {
 			temp = mnt_stat->f_blocks / (100 / 
@@ -463,6 +496,10 @@
 		goto out;
 	}
 
+	/*
+	 * XXXAUDIT: Should we actually allow this conversion to fail?  With
+	 * sleeping memory allocation and invariants checks, perhaps not.
+	 */
 	ret = kaudit_to_bsm(ar, &bsm);
 	if (ret == BSM_NOAUDIT) {
 		ret = 0;
@@ -486,6 +523,8 @@
 	 * away from the BSM record generation and have the BSM generation
 	 * done before this function is called. This function will then
 	 * take the BSM record as a parameter.
+	 *
+	 * XXXRW: In the new world order, this is no longer true.
 	 */
 	ret = (vn_rdwr(UIO_WRITE, vp, (void *)bsm->data, bsm->len,
 	    (off_t)0, UIO_SYSSPACE, IO_APPEND|IO_UNIT, cred, NULL, NULL, td));
@@ -510,6 +549,16 @@
 	return (ret);
 }
 
+/*
+ * The audit_worker thread is responsible for watching the event queue,
+ * dequeueing records, converting them to BSM format, and committing them to
+ * disk.  In order to minimize lock thrashing, records are dequeued in sets
+ * to a thread-local work queue.  In addition, the audit_work performs the
+ * actual exchange of audit log vnode pointer, as audit_vp is a thread-local
+ * variable.
+ *
+ * XXXAUDIT: Giant is now less required here.
+ */
 static void
 audit_worker(void *arg)
 {
@@ -649,6 +698,8 @@
 		 * with the audit_mtx held, to avoid a lock order reversal
 		 * as free() may grab Giant.  This should be fixed at
 		 * some point.
+		 *
+		 * XXXAUDIT: free() no longer grabs Giant.
 		 */
 		while ((ar = TAILQ_FIRST(&audit_q))) {
 			TAILQ_REMOVE(&audit_q, ar, k_q);
@@ -691,6 +742,11 @@
 	}
 }
 
+/*
+ * Initialize the Audit subsystem: configuration state, work queue,
+ * synchronization primitives, worker thread, and trigger device node.  Also
+ * call into the BSM assembly code to initialize it.
+ */
 void
 audit_init(void)
 {
@@ -735,6 +791,26 @@
 
 SYSINIT(audit_init, SI_SUB_AUDIT, SI_ORDER_FIRST, audit_init, NULL)
 
+/*
+ * audit_rotate_vnode() is called by a user or kernel thread to configure or
+ * de-configure auditing on a vnode.  The arguments are the replacement
+ * credential and vnode to substitute for the current credential and vnode,
+ * if any.  If either is set to NULL, both should be NULL, and this is used
+ * to indicate that audit is being disabled.  The real work is done in the
+ * audit_worker thread, but audit_rotate_vnode() waits synchronously for that
+ * to complete.
+ *
+ * The vnode should be referenced and opened by the caller.  The credential
+ * should be referenced.  audit_rotate_vnode() will own both references as of
+ * this call, so the caller should not release either.
+ *
+ * XXXAUDIT: Review synchronize communication logic.  Really, this is a
+ * message queue of depth 1.
+ *
+ * XXXAUDIT: Enhance the comments below to indicate that we are basically
+ * acquiring ownership of the communications queue, inserting our message,
+ * and waiting for an acknowledgement.
+ */
 static void
 audit_rotate_vnode(struct ucred *cred, struct vnode *vp)
 {
@@ -775,6 +851,8 @@
 
 	/* XXX  Need to figure out how the kernel->userspace file full
 	 * signalling will take place.
+	 *
+	 * XXXAUDIT: This comment may now be obsolete.
 	 */
 	audit_file_rotate_wait = 0; /* We can now request another rotation */
 }
@@ -785,12 +863,17 @@
 void
 audit_shutdown(void)
 {
+
 	audit_rotate_vnode(NULL, NULL);
 }
 
+/*
+ * Return the current thread's audit record, if any.
+ */
 static __inline__ struct kaudit_record *
 currecord(void)
 {
+
 	return (curthread->td_ar);
 }
 
@@ -833,9 +916,12 @@
 		/* This is not very efficient; we're required to allocate
 		 * a complete kernel audit record just so the user record
 		 * can tag along.
+		 *
+		 * XXXAUDIT: Maybe AUE_AUDIT in the system call context and
+		 * special pre-select handling?
 		 */
 		td->td_ar = audit_new(AUE_NULL, td);
-		if (td->td_ar == NULL) /*auditing not on, or memory error */
+		if (td->td_ar == NULL)
 			return (ENOTSUP);
 		ar = td->td_ar;
 	}
@@ -858,6 +944,9 @@
 	/* Attach the user audit record to the kernel audit record. Because
 	 * this system call is an auditable event, we will write the user
 	 * record along with the record for this audit event.
+	 *
+	 * XXXAUDIT: KASSERT appropriate starting values of k_udata, k_ulen,
+	 * k_ar_commit & AR_COMMIT_USER?
 	 */
 	ar->k_udata = rec;
 	ar->k_ulen  = uap->length;
@@ -921,6 +1010,8 @@
 
 	/* XXX Need to implement these commands by accessing the global
 	 * values associated with the commands.
+	 *
+	 * XXXAUDIT: Locking?
 	 */
 	switch (uap->cmd) {
 	case A_GETPOLICY:
@@ -1004,6 +1095,8 @@
 	case A_GETPINFO:
 		if (udata.au_aupinfo.ap_pid < 1) 
 			return (EINVAL);
+
+		/* XXXAUDIT: p_cansee()? */
 		if ((tp = pfind(udata.au_aupinfo.ap_pid)) == NULL)
 			return (EINVAL);
 
@@ -1022,6 +1115,8 @@
 	case A_SETPMASK:
 		if (udata.au_aupinfo.ap_pid < 1) 
 			return (EINVAL);
+
+		/* XXXAUDIT: p_cansee()? */
 		if ((tp = pfind(udata.au_aupinfo.ap_pid)) == NULL)
 			return (EINVAL);
 
@@ -1129,6 +1224,10 @@
 	/*
 	 * XXX:
 	 * Integer write on static pointer dereference: doesn't need locking?
+	 *
+	 * XXXAUDIT: Might need locking to serialize audit events in the same
+	 * order as change events?  Or maybe that's an under-solveable
+	 * problem.
 	 */     
 	PROC_LOCK(td->td_proc);
 	td->td_proc->p_au->ai_auid = id;
@@ -1237,10 +1336,15 @@
 	 * If a path is specified, open the replacement vnode, perform
 	 * validity checks, and grab another reference to the current
 	 * credential.
+	 *
+	 * XXXAUDIT: On Darwin, a NULL path is used to disable audit.
 	 */
 	if (uap->path == NULL)
 		return (EINVAL);
 
+	/*
+	 * XXXAUDIT: Giant may no longer be required here.
+	 */
 	mtx_lock(&Giant);
 	NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF, UIO_USERSPACE, uap->path, td);
 	flags = audit_open_flags;
@@ -1259,6 +1363,11 @@
 	}
 	cred = td->td_ucred;
 	crhold(cred);
+
+	/*
+	 * XXXAUDIT: Should audit_suspended actually be cleared by
+	 * audit_worker?
+	 */
 	audit_suspended = 0;
 
 	mtx_unlock(&Giant);
@@ -1274,6 +1383,13 @@
 
 /*
  * MPSAFE
+ *
+ * XXXAUDIT: There are a number of races present in the code below due to
+ * release and re-grab of the mutex.  The code should be revised to become
+ * slightly less racy.
+ *
+ * XXXAUDIT: Shouldn't there be logic here to sleep waiting on available
+ * pre_q space, suspending the system call until there is room?
  */
 struct kaudit_record *
 audit_new(int event, struct thread *td)
@@ -1306,7 +1422,7 @@
 	 * XXX: We may want to fail-stop if allocation fails.
 	 * XXX: The number of outstanding uncommitted audit records is
 	 * limited by the number of concurrent threads servicing system
-	 * calls in the kernel.  
+	 * calls in the kernel.
 	 */
 
 	ar = malloc(sizeof(*ar), M_AUDIT, M_WAITOK);
@@ -1322,7 +1438,12 @@
 	ar->k_ar.ar_event = event;
 	nanotime(&ar->k_ar.ar_starttime);
 
-	/* Export the subject credential. */
+	/*
+	 * Export the subject credential.
+	 *
+	 * XXXAUDIT: td_ucred access is OK without proc lock, but some other
+	 * fields here may require the proc lock.
+	 */
 	cru2x(td->td_ucred, &ar->k_ar.ar_subj_cred);
 	ar->k_ar.ar_subj_ruid = td->td_ucred->cr_ruid;
 	ar->k_ar.ar_subj_rgid = td->td_ucred->cr_rgid;
@@ -1332,6 +1453,7 @@
 	ar->k_ar.ar_subj_pid = td->td_proc->p_pid;
 	ar->k_ar.ar_subj_amask = td->td_proc->p_au->ai_mask;
 	ar->k_ar.ar_subj_term = td->td_proc->p_au->ai_termid;
+
 	bcopy(td->td_proc->p_comm, ar->k_ar.ar_subj_comm, MAXCOMLEN);
 
 	return (ar);
@@ -1366,7 +1488,9 @@
 	/*
 	 * Decide whether to commit the audit record by checking the
 	 * error value from the system call and using the appropriate
-	 * audit mask. 
+	 * audit mask.
+	 *
+	 * XXXAUDIT: Synchronize access to audit_nae_mask?
 	 */
 	if (ar->k_ar.ar_subj_auid == AU_DEFAUDITID)
 		aumask = &audit_nae_mask;
@@ -1457,8 +1581,10 @@
 }
 
 /*
- * Calls to set up and tear down audit structures associated with
- * each system call.
+ * audit_syscall_enter() is called on entry to eatch system call.  It is
+ * responsible for deciding whether or not to audit the call (preselection),
+ * and if so, allocating a per-thread audit record.  audit_new() will fill in
+ * basic thread/credential properties.
  */
 void
 audit_syscall_enter(unsigned short code, struct thread *td)
@@ -1466,12 +1592,17 @@
 	int audit_event;
 	struct au_mask *aumask;
 
+	KASSERT(td->td_ar == NULL, ("audit_syscall_enter: td->td_ar != NULL"));
+
 	/*
 	 * In FreeBSD, each ABI has its own system call table, and hence
 	 * mapping of system call codes to audit events.  Convert the code to
 	 * an audit event identifier using the process system call table
 	 * reference.  In Darwin, there's only one, so we use the global
 	 * symbol for the system call table.
+	 *
+	 * XXXAUDIT: Should we audit that a bad system call was made, and if
+	 * so, how?
 	 */
 	if (code >= td->td_proc->p_sysent->sv_size)
 		return;
@@ -1480,9 +1611,8 @@
 	if (audit_event == AUE_NULL)
 		return;
 
-	KASSERT(td->td_ar == NULL, ("audit_syscall_enter: td->td_ar != NULL"));
-
-	/* Check which audit mask to use; either the kernel non-attributable
+	/*
+	 * Check which audit mask to use; either the kernel non-attributable
 	 * event mask or the process audit mask.
 	 */
 	if (td->td_proc->p_au->ai_auid == AU_DEFAUDITID)
@@ -1500,6 +1630,13 @@
 		 * If we're out of space and need to suspend unprivileged
 		 * processes, do that here rather than trying to allocate
 		 * another audit record.
+		 *
+		 * XXXRW: We might wish to be able to continue here in the
+		 * future, if the system recovers.  That should be possible
+		 * by means of checking the condition in a loop around
+		 * cv_wait().  It might be desirable to reevaluate whether an
+		 * audit record is still required for this event by
+		 * re-calling au_preselect().
 		 */
 		if (audit_in_failure && suser(td) != 0) {
 			cv_wait(&audit_fail_cv, &audit_mtx);
@@ -1510,6 +1647,11 @@
 		td->td_ar = NULL;
 }
 
+/*
+ * audit_syscall_exit() is called from the return of every system call, or in
+ * the event of exit1(), during the execution of exit1().  It is responsible
+ * for committing the audit record, if any, along with return condition.
+ */
 void
 audit_syscall_exit(int error, struct thread *td)
 {
@@ -1543,6 +1685,9 @@
  * check the thread audit record pointer anyway, as the audit condition
  * could change, and pre-selection may not have allocated an audit
  * record for this event.
+ *
+ * XXXAUDIT: Should we assert, in each case, that this field of the record
+ * hasn't already been filled in?
  */
 void
 audit_arg_addr(void * addr)
@@ -1768,7 +1913,9 @@
 	if ((ar == NULL) || (p == NULL))
 		return;
 
-	/* XXX May need to lock the credentials structures */
+	/*
+	 * XXXAUDIT: PROC_LOCK_ASSERT(p);
+	 */
 	ar->k_ar.ar_arg_auid = p->p_au->ai_auid;
 	ar->k_ar.ar_arg_euid = p->p_ucred->cr_uid;
 	ar->k_ar.ar_arg_egid = p->p_ucred->cr_groups[0];
@@ -1811,6 +1958,10 @@
 	ar->k_ar.ar_valid_arg |= ARG_SOCKINFO;
 }
 
+/*
+ * XXXAUDIT: Argument here should be 'sa' not 'so'.  Caller is responsible
+ * for synchronizing access to the source of the address.
+ */
 void
 audit_arg_sockaddr(struct thread *td, struct sockaddr *so)
 {
@@ -1820,9 +1971,6 @@
 	if (ar == NULL || td == NULL || so == NULL)
 		return;
 
-	/*
-	 * XXX: Do we need to lock the socket?
-	 */
 	bcopy(so, &ar->k_ar.ar_arg_sockaddr, sizeof(ar->k_ar.ar_arg_sockaddr));
 	switch (so->sa_family) {
 	case AF_INET:
@@ -1836,6 +1984,7 @@
 				ARG_UPATH1);
 		ar->k_ar.ar_valid_arg |= ARG_SADDRUNIX;
 		break;
+	/* XXXAUDIT: default:? */
 	}
 }
 
@@ -1997,6 +2146,9 @@
 	struct socket *so;
 	struct inpcb *pcb;
 
+	/*
+	 * XXXAUDIT: Why is the (ar == NULL) test only in the socket case?
+	 */
 	if (fp->f_type == DTYPE_VNODE) {
 		audit_arg_vnpath((struct vnode *)fp->f_data, ARG_VNODE1);
 		return;
@@ -2006,6 +2158,10 @@
 		ar = currecord();
 		if (ar == NULL)
 			return;
+
+		/*
+		 * XXXAUDIT: Socket locking?  Inpcb locking?
+		 */
 		so = (struct socket *)fp->f_data;
 		if (INP_CHECK_SOCKAF(so, PF_INET)) {
 			if (so->so_pcb == NULL)
@@ -2027,6 +2183,7 @@
 				pcb->inp_lport;
 			ar->k_ar.ar_valid_arg |= ARG_SOCKINFO;
 		}
+		/* XXXAUDIT: else? */
 	}
 
 }
@@ -2041,6 +2198,7 @@
 	KASSERT(p->p_au == NULL, ("audit_proc_alloc: p->p_au != NULL (%d)",
 	    p->p_pid));
 	p->p_au = malloc(sizeof(*(p->p_au)), M_AUDIT, M_WAITOK);
+	/* XXXAUDIT: Zero?  Slab allocate? */
 	//printf("audit_proc_alloc: pid %d p_au %p\n", p->p_pid, p->p_au);
 }
 
@@ -2089,6 +2247,10 @@
 	//printf("audit_proc_fork: child pid %d p_au %p\n", child->p_pid,
 	//    child->p_au);
 	bcopy(parent->p_au, child->p_au, sizeof(*child->p_au));
+	/*
+	 * XXXAUDIT: Zero pointers to external memory, or assert they are
+	 * zero?
+	 */
 }
 
 /*
@@ -2100,6 +2262,9 @@
 
 	KASSERT(p->p_au != NULL, ("p->p_au == NULL (%d)", p->p_pid));
 	//printf("audit_proc_free: pid %d p_au %p\n", p->p_pid, p->p_au);
+	/*
+	 * XXXAUDIT: Assert that external memory pointers are NULL?
+	 */
 	free(p->p_au, M_AUDIT);
 	p->p_au = NULL;
 }
@@ -2109,6 +2274,8 @@
  * record stored on the user thread. This function will allocate the memory to 
  * store the path info if not already available. This memory will be 
  * freed when the audit record is freed.
+ *
+ * XXXAUDIT: Possibly assert that the memory isn't already allocated?
  */
 void
 audit_arg_upath(struct thread *td, char *upath, u_int64_t flags)
@@ -2119,18 +2286,24 @@
 	if (td == NULL || upath == NULL) 
 		return;		/* nothing to do! */
 
+	/*
+	 * XXXAUDIT: Witness warning for possible sleep here?
+	 */
+
+	/*
+	 * XXXAUDIT: KASSERT argument validity?
+	 */
 	if ((flags & (ARG_UPATH1 | ARG_UPATH2)) == 0)
 		return;
 
 	ar = currecord();
-	if (ar == NULL)	/* This will be the case for unaudited system calls */
+	if (ar == NULL)
 		return;
 
 	if (flags & ARG_UPATH1) {
 		ar->k_ar.ar_valid_arg &= (ARG_ALL ^ ARG_UPATH1);
 		pathp = &ar->k_ar.ar_arg_upath1;
-	}
-	else {
+	} else {
 		ar->k_ar.ar_valid_arg &= (ARG_ALL ^ ARG_UPATH2);
 		pathp = &ar->k_ar.ar_arg_upath2;
 	}
@@ -2158,6 +2331,8 @@
  * XXX: We should accept the process argument from the caller, since
  * it's very likely they already have a reference.
  * XXX: Error handling in this function is poor.
+ *
+ * XXXAUDIT: Possibly KASSERT the path pointer is NULL?
  */
 void
 audit_arg_vnpath(struct vnode *vp, u_int64_t flags)
@@ -2169,9 +2344,15 @@
 	struct vnode_au_info *vnp;
 	struct thread *td;
 
+	/*
+	 * XXXAUDIT: Why is vp possibly NULL here?
+	 */
 	if (vp == NULL)
 		return;
 
+	/*
+	 * XXXAUDIT: Less Giant needed here.
+	 */
 	mtx_assert(&Giant, MA_OWNED);
 	ASSERT_VOP_LOCKED(vp, "audit_arg_vnpath")
 
@@ -2179,6 +2360,9 @@
 	if (ar == NULL)	/* This will be the case for unaudited system calls */
 		return;
 
+	/*
+	 * XXXAUDIT: KASSERT argument validity instead?
+	 */
 	if ((flags & (ARG_VNODE1 | ARG_VNODE2)) == 0)
 		return;
 
@@ -2189,8 +2373,7 @@
 		ar->k_ar.ar_valid_arg &= (ARG_ALL ^ ARG_VNODE1);
 		pathp = &ar->k_ar.ar_arg_kpath1;
 		vnp = &ar->k_ar.ar_arg_vnode1;
-	}
-	else {
+	} else {
 		ar->k_ar.ar_valid_arg &= (ARG_ALL ^ ARG_KPATH2);
 		ar->k_ar.ar_valid_arg &= (ARG_ALL ^ ARG_VNODE2);
 		pathp = &ar->k_ar.ar_arg_kpath2;
To Unsubscribe: send mail to majordomo at trustedbsd.org
with "unsubscribe trustedbsd-cvs" in the body of the message