svn commit: r298004 - head/sys/cam

Scott Long scottl at FreeBSD.org
Thu Apr 14 21:55:43 UTC 2016


Author: scottl
Date: Thu Apr 14 21:55:42 2016
New Revision: 298004
URL: https://svnweb.freebsd.org/changeset/base/298004

Log:
  Add a devctl/devd notification conduit for CAM errors that happen at the
  periph level.  When a relevant error is reported to the periph, some
  amplifying information is gathered, and the error and information are fed
  to devctl with the attributes / keys system=CAM, subsystem=periph.  The
  'type' key will be either 'error' or 'timeout', and based on this, various
  other keys are also populated.
  
  The purpose of this is to provide a concise mechanism for error reporting
  that is less noisy than the system console but higher in resolution and
  fidelity than simple sysctl counters.  We will be using it at Netflix to
  populate a structured log and database to track errors and error trends
  across our world-wide population of drives.
  
  Submitted by:	imp, scottl
  Approved by:	kenm
  MFC after:	3 days
  Sponsored by:	Netflix
  Differential Revision:	D5943

Modified:
  head/sys/cam/cam_periph.c

Modified: head/sys/cam/cam_periph.c
==============================================================================
--- head/sys/cam/cam_periph.c	Thu Apr 14 21:52:31 2016	(r298003)
+++ head/sys/cam/cam_periph.c	Thu Apr 14 21:55:42 2016	(r298004)
@@ -86,6 +86,7 @@ static	int		camperiphscsisenseerror(unio
 					        u_int32_t *timeout,
 					        u_int32_t *action,
 					        const char **action_string);
+static void		cam_periph_devctl_notify(union ccb *ccb);
 
 static int nperiph_drivers;
 static int initialized = 0;
@@ -1615,7 +1616,7 @@ cam_periph_error(union ccb *ccb, cam_fla
 	struct cam_periph *periph;
 	const char *action_string;
 	cam_status  status;
-	int	    frozen, error, openings;
+	int	    frozen, error, openings, devctl_err;
 	u_int32_t   action, relsim_flags, timeout;
 
 	action = SSQ_PRINT_SENSE;
@@ -1624,9 +1625,26 @@ cam_periph_error(union ccb *ccb, cam_fla
 	status = ccb->ccb_h.status;
 	frozen = (status & CAM_DEV_QFRZN) != 0;
 	status &= CAM_STATUS_MASK;
-	openings = relsim_flags = timeout = 0;
+	devctl_err = openings = relsim_flags = timeout = 0;
 	orig_ccb = ccb;
 
+	/* Filter the errors that should be reported via devctl */
+	switch (ccb->ccb_h.status & CAM_STATUS_MASK) {
+	case CAM_CMD_TIMEOUT:
+	case CAM_REQ_ABORTED:
+	case CAM_REQ_CMP_ERR:
+	case CAM_REQ_TERMIO:
+	case CAM_UNREC_HBA_ERROR:
+	case CAM_DATA_RUN_ERR:
+	case CAM_SCSI_STATUS_ERROR:
+	case CAM_ATA_STATUS_ERROR:
+	case CAM_SMP_STATUS_ERROR:
+		devctl_err++;
+		break;
+	default:
+		break;
+	}
+
 	switch (status) {
 	case CAM_REQ_CMP:
 		error = 0;
@@ -1754,6 +1772,9 @@ cam_periph_error(union ccb *ccb, cam_fla
 			xpt_print(ccb->ccb_h.path, "Retrying command\n");
 	}
 
+	if (devctl_err)
+		cam_periph_devctl_notify(orig_ccb);
+
 	if ((action & SSQ_LOST) != 0) {
 		lun_id_t lun_id;
 
@@ -1824,3 +1845,80 @@ cam_periph_error(union ccb *ccb, cam_fla
 
 	return (error);
 }
+
+#define CAM_PERIPH_DEVD_MSG_SIZE	256
+
+static void
+cam_periph_devctl_notify(union ccb *ccb)
+{
+	struct cam_periph *periph;
+	struct ccb_getdev *cgd;
+	struct sbuf sb;
+	int serr, sk, asc, ascq;
+	char *sbmsg, *type;
+
+	sbmsg = malloc(CAM_PERIPH_DEVD_MSG_SIZE, M_CAMPERIPH, M_NOWAIT);
+	if (sbmsg == NULL)
+		return;
+
+	sbuf_new(&sb, sbmsg, CAM_PERIPH_DEVD_MSG_SIZE, SBUF_FIXEDLEN);
+
+	periph = xpt_path_periph(ccb->ccb_h.path);
+	sbuf_printf(&sb, "device=%s%d ", periph->periph_name,
+	    periph->unit_number);
+
+	sbuf_printf(&sb, "serial=\"");
+	if ((cgd = (struct ccb_getdev *)xpt_alloc_ccb_nowait()) != NULL) {
+		xpt_setup_ccb(&cgd->ccb_h, ccb->ccb_h.path,
+		    CAM_PRIORITY_NORMAL);
+		cgd->ccb_h.func_code = XPT_GDEV_TYPE;
+		xpt_action((union ccb *)cgd);
+
+		if (cgd->ccb_h.status == CAM_REQ_CMP)
+			sbuf_bcat(&sb, cgd->serial_num, cgd->serial_num_len);
+	}
+	sbuf_printf(&sb, "\" ");
+	sbuf_printf(&sb, "cam_status=\"0x%x\" ", ccb->ccb_h.status);
+
+	switch (ccb->ccb_h.status & CAM_STATUS_MASK) {
+	case CAM_CMD_TIMEOUT:
+		sbuf_printf(&sb, "timeout=%d ", ccb->ccb_h.timeout);
+		type = "timeout";
+		break;
+	case CAM_SCSI_STATUS_ERROR:
+		sbuf_printf(&sb, "scsi_status=%d ", ccb->csio.scsi_status);
+		if (scsi_extract_sense_ccb(ccb, &serr, &sk, &asc, &ascq))
+			sbuf_printf(&sb, "scsi_sense=\"%02x %02x %02x %02x\" ",
+			    serr, sk, asc, ascq);
+		type = "error";
+		break;
+	case CAM_ATA_STATUS_ERROR:
+		sbuf_printf(&sb, "RES=\"");
+		ata_res_sbuf(&ccb->ataio.res, &sb);
+		sbuf_printf(&sb, "\" ");
+		type = "error";
+		break;
+	default:
+		type = "error";
+		break;
+	}
+
+	if (ccb->ccb_h.func_code == XPT_SCSI_IO) {
+		sbuf_printf(&sb, "CDB=\"");
+		if ((ccb->ccb_h.flags & CAM_CDB_POINTER) != 0)
+			scsi_cdb_sbuf(ccb->csio.cdb_io.cdb_ptr, &sb);
+		else
+			scsi_cdb_sbuf(ccb->csio.cdb_io.cdb_bytes, &sb);
+		sbuf_printf(&sb, "\" ");
+	} else if (ccb->ccb_h.func_code == XPT_ATA_IO) {
+		sbuf_printf(&sb, "ACB=\"");
+		ata_cmd_sbuf(&ccb->ataio.cmd, &sb);
+		sbuf_printf(&sb, "\" ");
+	}
+
+	if (sbuf_finish(&sb) == 0)
+		devctl_notify("CAM", "periph", type, sbuf_data(&sb));
+	sbuf_delete(&sb);
+	free(sbmsg, M_CAMPERIPH);
+}
+


More information about the svn-src-head mailing list