PERFORCE change 80064 for review
Robert Watson
rwatson at FreeBSD.org
Wed Jul 13 11:03:33 GMT 2005
http://perforce.freebsd.org/chv.cgi?CH=80064
Change 80064 by rwatson at rwatson_paprika on 2005/07/13 11:03:29
A number of annotations for consideration in future cleanup:
- Might be good to use UMA for per-thread and per-record storage,
rather than malloc.
- Active credential and vnode are thread-local variables.
- Annotate some error choices.
- Minor style tweaks.
- Annotate odd, fixable, improper, or otherwise interesting use of
error handling. FreeBSD does not permit M_WAITOK to fail for some
memory types, and that lack of that failure can clean up quite a
bit of logic here.
- Annotate possible spots to remove Giant due to MPSAFE VFS.
Annotate lack of locking in the netinet interactions.
- Comment about calculation of free space.
- Summary comments for audit_worker(), audit_init(),
audit_rotate_vnode(), currecord(), audit_syscall_enter(),
audit_syscall_exit(), as well as potential issues with them.
- Comments about places to possibly add assertions.
Affected files ...
.. //depot/projects/trustedbsd/audit3/sys/security/audit/kern_audit.c#26 edit
Differences ...
==== //depot/projects/trustedbsd/audit3/sys/security/audit/kern_audit.c#26 (text+ko) ====
@@ -55,6 +55,10 @@
#include <bsm/audit_kernel.h>
#include <security/audit/audit_klib.h>
+/*
+ * XXXAUDIT: Might be nice to use a UMA zone for records, and malloc(9) only
+ * for things like triggers, temporary storage, etc.
+ */
MALLOC_DEFINE(M_AUDIT, "audit", "Audit event records");
#ifdef AUDIT
@@ -134,6 +138,8 @@
* replacement. When a replacement is completed, this cv is signalled
* by the worker thread so a waiting thread can start another replacement.
* We also store a credential to perform audit log write operations with.
+ *
+ * The current credential and vnode are thread-local to audit_worker.
*/
static struct cv audit_replacement_cv;
@@ -211,6 +217,9 @@
error = 0;
audit_isopen = 1;
} else {
+ /*
+ * XXXRW: Why not EBUSY?
+ */
error = EOPNOTSUPP;
}
mtx_unlock(&audit_mtx);
@@ -263,10 +272,15 @@
static int
audit_write(struct cdev *dev, struct uio *uio, int ioflag)
{
+
/* Communication is kernel->userspace only */
return EOPNOTSUPP;
}
+/*
+ * XXXAUDIT: Since we can't fail this call, we should not have a return
+ * value.
+ */
static int
send_trigger(unsigned int trigger)
{
@@ -276,6 +290,9 @@
if (!audit_isopen)
return 0;
+ /*
+ * XXXAUDIT: Use a condition variable instead of msleep/wakeup?
+ */
ti = malloc(sizeof *ti, M_AUDIT, M_WAITOK);
mtx_lock(&audit_mtx);
ti->trigger = trigger;
@@ -294,9 +311,13 @@
.d_name = "audit"
};
+/*
+ * XXXAUDIT: For consistency, perhaps audit_record_free()?
+ */
static void
audit_free(struct kaudit_record *ar)
{
+
if (ar->k_ar.ar_arg_upath1 != NULL) {
free(ar->k_ar.ar_arg_upath1, M_AUDIT);
}
@@ -318,6 +339,13 @@
free(ar, M_AUDIT);
}
+/*
+ * XXXAUDIT: Should adjust comments below to make it clear that we get to
+ * this point only if we believe we have storage, so not having space here
+ * is a violation of invariants derived from administrative procedures.
+ * I.e., someone else has written to the audit partition, leaving less space
+ * than we accounted for.
+ */
static int
audit_record_write(struct vnode *vp, struct kaudit_record *ar,
struct ucred *cred, struct thread *td)
@@ -328,6 +356,9 @@
struct vattr vattr;
struct statfs *mnt_stat = &vp->v_mount->mnt_stat;
+ /*
+ * XXXAUDIT: In the world of MPSAFE VFS, this may not be necessary.
+ */
mtx_assert(&Giant, MA_OWNED);
/*
@@ -385,6 +416,8 @@
/*
* Send a message to the audit daemon that disk space
* is getting low.
+ *
+ * XXXAUDIT: Check math and block size calculation here.
*/
if (audit_qctrl.aq_minfree != 0) {
temp = mnt_stat->f_blocks / (100 /
@@ -463,6 +496,10 @@
goto out;
}
+ /*
+ * XXXAUDIT: Should we actually allow this conversion to fail? With
+ * sleeping memory allocation and invariants checks, perhaps not.
+ */
ret = kaudit_to_bsm(ar, &bsm);
if (ret == BSM_NOAUDIT) {
ret = 0;
@@ -486,6 +523,8 @@
* away from the BSM record generation and have the BSM generation
* done before this function is called. This function will then
* take the BSM record as a parameter.
+ *
+ * XXXRW: In the new world order, this is no longer true.
*/
ret = (vn_rdwr(UIO_WRITE, vp, (void *)bsm->data, bsm->len,
(off_t)0, UIO_SYSSPACE, IO_APPEND|IO_UNIT, cred, NULL, NULL, td));
@@ -510,6 +549,16 @@
return (ret);
}
+/*
+ * The audit_worker thread is responsible for watching the event queue,
+ * dequeueing records, converting them to BSM format, and committing them to
+ * disk. In order to minimize lock thrashing, records are dequeued in sets
+ * to a thread-local work queue. In addition, the audit_work performs the
+ * actual exchange of audit log vnode pointer, as audit_vp is a thread-local
+ * variable.
+ *
+ * XXXAUDIT: Giant is now less required here.
+ */
static void
audit_worker(void *arg)
{
@@ -649,6 +698,8 @@
* with the audit_mtx held, to avoid a lock order reversal
* as free() may grab Giant. This should be fixed at
* some point.
+ *
+ * XXXAUDIT: free() no longer grabs Giant.
*/
while ((ar = TAILQ_FIRST(&audit_q))) {
TAILQ_REMOVE(&audit_q, ar, k_q);
@@ -691,6 +742,11 @@
}
}
+/*
+ * Initialize the Audit subsystem: configuration state, work queue,
+ * synchronization primitives, worker thread, and trigger device node. Also
+ * call into the BSM assembly code to initialize it.
+ */
void
audit_init(void)
{
@@ -735,6 +791,26 @@
SYSINIT(audit_init, SI_SUB_AUDIT, SI_ORDER_FIRST, audit_init, NULL)
+/*
+ * audit_rotate_vnode() is called by a user or kernel thread to configure or
+ * de-configure auditing on a vnode. The arguments are the replacement
+ * credential and vnode to substitute for the current credential and vnode,
+ * if any. If either is set to NULL, both should be NULL, and this is used
+ * to indicate that audit is being disabled. The real work is done in the
+ * audit_worker thread, but audit_rotate_vnode() waits synchronously for that
+ * to complete.
+ *
+ * The vnode should be referenced and opened by the caller. The credential
+ * should be referenced. audit_rotate_vnode() will own both references as of
+ * this call, so the caller should not release either.
+ *
+ * XXXAUDIT: Review synchronize communication logic. Really, this is a
+ * message queue of depth 1.
+ *
+ * XXXAUDIT: Enhance the comments below to indicate that we are basically
+ * acquiring ownership of the communications queue, inserting our message,
+ * and waiting for an acknowledgement.
+ */
static void
audit_rotate_vnode(struct ucred *cred, struct vnode *vp)
{
@@ -775,6 +851,8 @@
/* XXX Need to figure out how the kernel->userspace file full
* signalling will take place.
+ *
+ * XXXAUDIT: This comment may now be obsolete.
*/
audit_file_rotate_wait = 0; /* We can now request another rotation */
}
@@ -785,12 +863,17 @@
void
audit_shutdown(void)
{
+
audit_rotate_vnode(NULL, NULL);
}
+/*
+ * Return the current thread's audit record, if any.
+ */
static __inline__ struct kaudit_record *
currecord(void)
{
+
return (curthread->td_ar);
}
@@ -833,9 +916,12 @@
/* This is not very efficient; we're required to allocate
* a complete kernel audit record just so the user record
* can tag along.
+ *
+ * XXXAUDIT: Maybe AUE_AUDIT in the system call context and
+ * special pre-select handling?
*/
td->td_ar = audit_new(AUE_NULL, td);
- if (td->td_ar == NULL) /*auditing not on, or memory error */
+ if (td->td_ar == NULL)
return (ENOTSUP);
ar = td->td_ar;
}
@@ -858,6 +944,9 @@
/* Attach the user audit record to the kernel audit record. Because
* this system call is an auditable event, we will write the user
* record along with the record for this audit event.
+ *
+ * XXXAUDIT: KASSERT appropriate starting values of k_udata, k_ulen,
+ * k_ar_commit & AR_COMMIT_USER?
*/
ar->k_udata = rec;
ar->k_ulen = uap->length;
@@ -921,6 +1010,8 @@
/* XXX Need to implement these commands by accessing the global
* values associated with the commands.
+ *
+ * XXXAUDIT: Locking?
*/
switch (uap->cmd) {
case A_GETPOLICY:
@@ -1004,6 +1095,8 @@
case A_GETPINFO:
if (udata.au_aupinfo.ap_pid < 1)
return (EINVAL);
+
+ /* XXXAUDIT: p_cansee()? */
if ((tp = pfind(udata.au_aupinfo.ap_pid)) == NULL)
return (EINVAL);
@@ -1022,6 +1115,8 @@
case A_SETPMASK:
if (udata.au_aupinfo.ap_pid < 1)
return (EINVAL);
+
+ /* XXXAUDIT: p_cansee()? */
if ((tp = pfind(udata.au_aupinfo.ap_pid)) == NULL)
return (EINVAL);
@@ -1129,6 +1224,10 @@
/*
* XXX:
* Integer write on static pointer dereference: doesn't need locking?
+ *
+ * XXXAUDIT: Might need locking to serialize audit events in the same
+ * order as change events? Or maybe that's an under-solveable
+ * problem.
*/
PROC_LOCK(td->td_proc);
td->td_proc->p_au->ai_auid = id;
@@ -1237,10 +1336,15 @@
* If a path is specified, open the replacement vnode, perform
* validity checks, and grab another reference to the current
* credential.
+ *
+ * XXXAUDIT: On Darwin, a NULL path is used to disable audit.
*/
if (uap->path == NULL)
return (EINVAL);
+ /*
+ * XXXAUDIT: Giant may no longer be required here.
+ */
mtx_lock(&Giant);
NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF, UIO_USERSPACE, uap->path, td);
flags = audit_open_flags;
@@ -1259,6 +1363,11 @@
}
cred = td->td_ucred;
crhold(cred);
+
+ /*
+ * XXXAUDIT: Should audit_suspended actually be cleared by
+ * audit_worker?
+ */
audit_suspended = 0;
mtx_unlock(&Giant);
@@ -1274,6 +1383,13 @@
/*
* MPSAFE
+ *
+ * XXXAUDIT: There are a number of races present in the code below due to
+ * release and re-grab of the mutex. The code should be revised to become
+ * slightly less racy.
+ *
+ * XXXAUDIT: Shouldn't there be logic here to sleep waiting on available
+ * pre_q space, suspending the system call until there is room?
*/
struct kaudit_record *
audit_new(int event, struct thread *td)
@@ -1306,7 +1422,7 @@
* XXX: We may want to fail-stop if allocation fails.
* XXX: The number of outstanding uncommitted audit records is
* limited by the number of concurrent threads servicing system
- * calls in the kernel.
+ * calls in the kernel.
*/
ar = malloc(sizeof(*ar), M_AUDIT, M_WAITOK);
@@ -1322,7 +1438,12 @@
ar->k_ar.ar_event = event;
nanotime(&ar->k_ar.ar_starttime);
- /* Export the subject credential. */
+ /*
+ * Export the subject credential.
+ *
+ * XXXAUDIT: td_ucred access is OK without proc lock, but some other
+ * fields here may require the proc lock.
+ */
cru2x(td->td_ucred, &ar->k_ar.ar_subj_cred);
ar->k_ar.ar_subj_ruid = td->td_ucred->cr_ruid;
ar->k_ar.ar_subj_rgid = td->td_ucred->cr_rgid;
@@ -1332,6 +1453,7 @@
ar->k_ar.ar_subj_pid = td->td_proc->p_pid;
ar->k_ar.ar_subj_amask = td->td_proc->p_au->ai_mask;
ar->k_ar.ar_subj_term = td->td_proc->p_au->ai_termid;
+
bcopy(td->td_proc->p_comm, ar->k_ar.ar_subj_comm, MAXCOMLEN);
return (ar);
@@ -1366,7 +1488,9 @@
/*
* Decide whether to commit the audit record by checking the
* error value from the system call and using the appropriate
- * audit mask.
+ * audit mask.
+ *
+ * XXXAUDIT: Synchronize access to audit_nae_mask?
*/
if (ar->k_ar.ar_subj_auid == AU_DEFAUDITID)
aumask = &audit_nae_mask;
@@ -1457,8 +1581,10 @@
}
/*
- * Calls to set up and tear down audit structures associated with
- * each system call.
+ * audit_syscall_enter() is called on entry to eatch system call. It is
+ * responsible for deciding whether or not to audit the call (preselection),
+ * and if so, allocating a per-thread audit record. audit_new() will fill in
+ * basic thread/credential properties.
*/
void
audit_syscall_enter(unsigned short code, struct thread *td)
@@ -1466,12 +1592,17 @@
int audit_event;
struct au_mask *aumask;
+ KASSERT(td->td_ar == NULL, ("audit_syscall_enter: td->td_ar != NULL"));
+
/*
* In FreeBSD, each ABI has its own system call table, and hence
* mapping of system call codes to audit events. Convert the code to
* an audit event identifier using the process system call table
* reference. In Darwin, there's only one, so we use the global
* symbol for the system call table.
+ *
+ * XXXAUDIT: Should we audit that a bad system call was made, and if
+ * so, how?
*/
if (code >= td->td_proc->p_sysent->sv_size)
return;
@@ -1480,9 +1611,8 @@
if (audit_event == AUE_NULL)
return;
- KASSERT(td->td_ar == NULL, ("audit_syscall_enter: td->td_ar != NULL"));
-
- /* Check which audit mask to use; either the kernel non-attributable
+ /*
+ * Check which audit mask to use; either the kernel non-attributable
* event mask or the process audit mask.
*/
if (td->td_proc->p_au->ai_auid == AU_DEFAUDITID)
@@ -1500,6 +1630,13 @@
* If we're out of space and need to suspend unprivileged
* processes, do that here rather than trying to allocate
* another audit record.
+ *
+ * XXXRW: We might wish to be able to continue here in the
+ * future, if the system recovers. That should be possible
+ * by means of checking the condition in a loop around
+ * cv_wait(). It might be desirable to reevaluate whether an
+ * audit record is still required for this event by
+ * re-calling au_preselect().
*/
if (audit_in_failure && suser(td) != 0) {
cv_wait(&audit_fail_cv, &audit_mtx);
@@ -1510,6 +1647,11 @@
td->td_ar = NULL;
}
+/*
+ * audit_syscall_exit() is called from the return of every system call, or in
+ * the event of exit1(), during the execution of exit1(). It is responsible
+ * for committing the audit record, if any, along with return condition.
+ */
void
audit_syscall_exit(int error, struct thread *td)
{
@@ -1543,6 +1685,9 @@
* check the thread audit record pointer anyway, as the audit condition
* could change, and pre-selection may not have allocated an audit
* record for this event.
+ *
+ * XXXAUDIT: Should we assert, in each case, that this field of the record
+ * hasn't already been filled in?
*/
void
audit_arg_addr(void * addr)
@@ -1768,7 +1913,9 @@
if ((ar == NULL) || (p == NULL))
return;
- /* XXX May need to lock the credentials structures */
+ /*
+ * XXXAUDIT: PROC_LOCK_ASSERT(p);
+ */
ar->k_ar.ar_arg_auid = p->p_au->ai_auid;
ar->k_ar.ar_arg_euid = p->p_ucred->cr_uid;
ar->k_ar.ar_arg_egid = p->p_ucred->cr_groups[0];
@@ -1811,6 +1958,10 @@
ar->k_ar.ar_valid_arg |= ARG_SOCKINFO;
}
+/*
+ * XXXAUDIT: Argument here should be 'sa' not 'so'. Caller is responsible
+ * for synchronizing access to the source of the address.
+ */
void
audit_arg_sockaddr(struct thread *td, struct sockaddr *so)
{
@@ -1820,9 +1971,6 @@
if (ar == NULL || td == NULL || so == NULL)
return;
- /*
- * XXX: Do we need to lock the socket?
- */
bcopy(so, &ar->k_ar.ar_arg_sockaddr, sizeof(ar->k_ar.ar_arg_sockaddr));
switch (so->sa_family) {
case AF_INET:
@@ -1836,6 +1984,7 @@
ARG_UPATH1);
ar->k_ar.ar_valid_arg |= ARG_SADDRUNIX;
break;
+ /* XXXAUDIT: default:? */
}
}
@@ -1997,6 +2146,9 @@
struct socket *so;
struct inpcb *pcb;
+ /*
+ * XXXAUDIT: Why is the (ar == NULL) test only in the socket case?
+ */
if (fp->f_type == DTYPE_VNODE) {
audit_arg_vnpath((struct vnode *)fp->f_data, ARG_VNODE1);
return;
@@ -2006,6 +2158,10 @@
ar = currecord();
if (ar == NULL)
return;
+
+ /*
+ * XXXAUDIT: Socket locking? Inpcb locking?
+ */
so = (struct socket *)fp->f_data;
if (INP_CHECK_SOCKAF(so, PF_INET)) {
if (so->so_pcb == NULL)
@@ -2027,6 +2183,7 @@
pcb->inp_lport;
ar->k_ar.ar_valid_arg |= ARG_SOCKINFO;
}
+ /* XXXAUDIT: else? */
}
}
@@ -2041,6 +2198,7 @@
KASSERT(p->p_au == NULL, ("audit_proc_alloc: p->p_au != NULL (%d)",
p->p_pid));
p->p_au = malloc(sizeof(*(p->p_au)), M_AUDIT, M_WAITOK);
+ /* XXXAUDIT: Zero? Slab allocate? */
//printf("audit_proc_alloc: pid %d p_au %p\n", p->p_pid, p->p_au);
}
@@ -2089,6 +2247,10 @@
//printf("audit_proc_fork: child pid %d p_au %p\n", child->p_pid,
// child->p_au);
bcopy(parent->p_au, child->p_au, sizeof(*child->p_au));
+ /*
+ * XXXAUDIT: Zero pointers to external memory, or assert they are
+ * zero?
+ */
}
/*
@@ -2100,6 +2262,9 @@
KASSERT(p->p_au != NULL, ("p->p_au == NULL (%d)", p->p_pid));
//printf("audit_proc_free: pid %d p_au %p\n", p->p_pid, p->p_au);
+ /*
+ * XXXAUDIT: Assert that external memory pointers are NULL?
+ */
free(p->p_au, M_AUDIT);
p->p_au = NULL;
}
@@ -2109,6 +2274,8 @@
* record stored on the user thread. This function will allocate the memory to
* store the path info if not already available. This memory will be
* freed when the audit record is freed.
+ *
+ * XXXAUDIT: Possibly assert that the memory isn't already allocated?
*/
void
audit_arg_upath(struct thread *td, char *upath, u_int64_t flags)
@@ -2119,18 +2286,24 @@
if (td == NULL || upath == NULL)
return; /* nothing to do! */
+ /*
+ * XXXAUDIT: Witness warning for possible sleep here?
+ */
+
+ /*
+ * XXXAUDIT: KASSERT argument validity?
+ */
if ((flags & (ARG_UPATH1 | ARG_UPATH2)) == 0)
return;
ar = currecord();
- if (ar == NULL) /* This will be the case for unaudited system calls */
+ if (ar == NULL)
return;
if (flags & ARG_UPATH1) {
ar->k_ar.ar_valid_arg &= (ARG_ALL ^ ARG_UPATH1);
pathp = &ar->k_ar.ar_arg_upath1;
- }
- else {
+ } else {
ar->k_ar.ar_valid_arg &= (ARG_ALL ^ ARG_UPATH2);
pathp = &ar->k_ar.ar_arg_upath2;
}
@@ -2158,6 +2331,8 @@
* XXX: We should accept the process argument from the caller, since
* it's very likely they already have a reference.
* XXX: Error handling in this function is poor.
+ *
+ * XXXAUDIT: Possibly KASSERT the path pointer is NULL?
*/
void
audit_arg_vnpath(struct vnode *vp, u_int64_t flags)
@@ -2169,9 +2344,15 @@
struct vnode_au_info *vnp;
struct thread *td;
+ /*
+ * XXXAUDIT: Why is vp possibly NULL here?
+ */
if (vp == NULL)
return;
+ /*
+ * XXXAUDIT: Less Giant needed here.
+ */
mtx_assert(&Giant, MA_OWNED);
ASSERT_VOP_LOCKED(vp, "audit_arg_vnpath")
@@ -2179,6 +2360,9 @@
if (ar == NULL) /* This will be the case for unaudited system calls */
return;
+ /*
+ * XXXAUDIT: KASSERT argument validity instead?
+ */
if ((flags & (ARG_VNODE1 | ARG_VNODE2)) == 0)
return;
@@ -2189,8 +2373,7 @@
ar->k_ar.ar_valid_arg &= (ARG_ALL ^ ARG_VNODE1);
pathp = &ar->k_ar.ar_arg_kpath1;
vnp = &ar->k_ar.ar_arg_vnode1;
- }
- else {
+ } else {
ar->k_ar.ar_valid_arg &= (ARG_ALL ^ ARG_KPATH2);
ar->k_ar.ar_valid_arg &= (ARG_ALL ^ ARG_VNODE2);
pathp = &ar->k_ar.ar_arg_kpath2;
To Unsubscribe: send mail to majordomo at trustedbsd.org
with "unsubscribe trustedbsd-cvs" in the body of the message
More information about the trustedbsd-cvs
mailing list