svn commit: r256027 - stable/9/sbin/hastd
Mikolaj Golub
trociny at FreeBSD.org
Thu Oct 3 18:52:06 UTC 2013
Author: trociny
Date: Thu Oct 3 18:52:04 2013
New Revision: 256027
URL: http://svnweb.freebsd.org/changeset/base/256027
Log:
MFC r255714, r255716, r255717:
r255714:
Use cv_broadcast() instead of cv_signal() when waking up threads
waiting on an empty queue as the queue may have several consumers.
Before the fix the following scenario was possible: 2 threads are
waiting on empty queue, 2 threads are inserting simultaneously. The
first inserting thread detects that the queue is empty and is going to
send the signal, but before it sends the second thread inserts
too. When the first sends the signal only one of the waiting threads
receives it while the other one may wait forever.
The scenario above is believed to be the cause of the observed
cases, when ggate_recv_thread() was getting stuck on taking free
request, while the free queue was not empty.
Reviewed by: pjd
Tested by: Yamagi Burmeister yamagi.org
r255716:
When updating the map of dirty extents, most recently used extents are
kept dirty to reduce the number of on-disk metadata updates. The
sequence of operations is:
1) acquire the activemap lock;
2) update in-memory map;
3) if the list of keepdirty extents is changed, update on-disk metadata;
4) release the lock.
On-disk updates are not frequent in comparison with in-memory updates,
while requiring much more time. So situations are possible when one
thread is updating on-disk metadata and another one is waiting for the
activemap lock just to update the in-memory map.
Improve this by introducing additional, on-disk map lock: when
in-memory map is updated and it is detected that the on-disk map needs
update too, the on-disk map lock is acquired and the in-memory map lock is
released before flushing the map.
Reported by: Yamagi Burmeister yamagi.org
Tested by: Yamagi Burmeister yamagi.org
Reviewed by: pjd
r255717:
Fix comments.
Modified:
stable/9/sbin/hastd/hast.h
stable/9/sbin/hastd/primary.c
stable/9/sbin/hastd/secondary.c
Directory Properties:
stable/9/sbin/hastd/ (props changed)
Modified: stable/9/sbin/hastd/hast.h
==============================================================================
--- stable/9/sbin/hastd/hast.h Thu Oct 3 18:50:09 2013 (r256026)
+++ stable/9/sbin/hastd/hast.h Thu Oct 3 18:52:04 2013 (r256027)
@@ -226,8 +226,10 @@ struct hast_resource {
/* Activemap structure. */
struct activemap *hr_amp;
- /* Locked used to synchronize access to hr_amp. */
+ /* Lock used to synchronize access to hr_amp. */
pthread_mutex_t hr_amp_lock;
+ /* Lock used to synchronize access to hr_amp diskmap. */
+ pthread_mutex_t hr_amp_diskmap_lock;
/* Number of BIO_READ requests. */
uint64_t hr_stat_read;
Modified: stable/9/sbin/hastd/primary.c
==============================================================================
--- stable/9/sbin/hastd/primary.c Thu Oct 3 18:50:09 2013 (r256026)
+++ stable/9/sbin/hastd/primary.c Thu Oct 3 18:52:04 2013 (r256027)
@@ -172,7 +172,7 @@ static pthread_mutex_t metadata_lock;
hio_next[(ncomp)]); \
mtx_unlock(&hio_##name##_list_lock[ncomp]); \
if (_wakeup) \
- cv_signal(&hio_##name##_list_cond[(ncomp)]); \
+ cv_broadcast(&hio_##name##_list_cond[(ncomp)]); \
} while (0)
#define QUEUE_INSERT2(hio, name) do { \
bool _wakeup; \
@@ -182,7 +182,7 @@ static pthread_mutex_t metadata_lock;
TAILQ_INSERT_TAIL(&hio_##name##_list, (hio), hio_##name##_next);\
mtx_unlock(&hio_##name##_list_lock); \
if (_wakeup) \
- cv_signal(&hio_##name##_list_cond); \
+ cv_broadcast(&hio_##name##_list_cond); \
} while (0)
#define QUEUE_TAKE1(hio, name, ncomp, timeout) do { \
bool _last; \
@@ -291,22 +291,28 @@ primary_exitx(int exitcode, const char *
exit(exitcode);
}
+/* Expects res->hr_amp locked, returns unlocked. */
static int
hast_activemap_flush(struct hast_resource *res)
{
const unsigned char *buf;
size_t size;
+ int ret;
+ mtx_lock(&res->hr_amp_diskmap_lock);
buf = activemap_bitmap(res->hr_amp, &size);
+ mtx_unlock(&res->hr_amp_lock);
PJDLOG_ASSERT(buf != NULL);
PJDLOG_ASSERT((size % res->hr_local_sectorsize) == 0);
+ ret = 0;
if (pwrite(res->hr_localfd, buf, size, METADATA_SIZE) !=
(ssize_t)size) {
pjdlog_errno(LOG_ERR, "Unable to flush activemap to disk");
res->hr_stat_activemap_write_error++;
- return (-1);
+ ret = -1;
}
- if (res->hr_metaflush == 1 && g_flush(res->hr_localfd) == -1) {
+ if (ret == 0 && res->hr_metaflush == 1 &&
+ g_flush(res->hr_localfd) == -1) {
if (errno == EOPNOTSUPP) {
pjdlog_warning("The %s provider doesn't support flushing write cache. Disabling it.",
res->hr_localpath);
@@ -315,10 +321,11 @@ hast_activemap_flush(struct hast_resourc
pjdlog_errno(LOG_ERR,
"Unable to flush disk cache on activemap update");
res->hr_stat_activemap_flush_error++;
- return (-1);
+ ret = -1;
}
}
- return (0);
+ mtx_unlock(&res->hr_amp_diskmap_lock);
+ return (ret);
}
static bool
@@ -783,6 +790,7 @@ init_remote(struct hast_resource *res, s
* Now that we merged bitmaps from both nodes, flush it to the
* disk before we start to synchronize.
*/
+ mtx_lock(&res->hr_amp_lock);
(void)hast_activemap_flush(res);
}
nv_free(nvin);
@@ -1288,8 +1296,9 @@ ggate_recv_thread(void *arg)
ggio->gctl_offset, ggio->gctl_length)) {
res->hr_stat_activemap_update++;
(void)hast_activemap_flush(res);
+ } else {
+ mtx_unlock(&res->hr_amp_lock);
}
- mtx_unlock(&res->hr_amp_lock);
break;
case BIO_DELETE:
res->hr_stat_delete++;
@@ -1649,8 +1658,9 @@ done_queue:
if (activemap_need_sync(res->hr_amp, ggio->gctl_offset,
ggio->gctl_length)) {
(void)hast_activemap_flush(res);
+ } else {
+ mtx_unlock(&res->hr_amp_lock);
}
- mtx_unlock(&res->hr_amp_lock);
if (hio->hio_replication == HAST_REPLICATION_MEMSYNC)
(void)refcnt_release(&hio->hio_countdown);
}
@@ -1917,8 +1927,9 @@ ggate_send_thread(void *arg)
ggio->gctl_offset, ggio->gctl_length)) {
res->hr_stat_activemap_update++;
(void)hast_activemap_flush(res);
+ } else {
+ mtx_unlock(&res->hr_amp_lock);
}
- mtx_unlock(&res->hr_amp_lock);
}
if (ggio->gctl_cmd == BIO_WRITE) {
/*
@@ -2014,8 +2025,11 @@ sync_thread(void *arg __unused)
*/
if (activemap_extent_complete(res->hr_amp, syncext))
(void)hast_activemap_flush(res);
+ else
+ mtx_unlock(&res->hr_amp_lock);
+ } else {
+ mtx_unlock(&res->hr_amp_lock);
}
- mtx_unlock(&res->hr_amp_lock);
if (dorewind) {
dorewind = false;
if (offset == -1)
Modified: stable/9/sbin/hastd/secondary.c
==============================================================================
--- stable/9/sbin/hastd/secondary.c Thu Oct 3 18:50:09 2013 (r256026)
+++ stable/9/sbin/hastd/secondary.c Thu Oct 3 18:52:04 2013 (r256027)
@@ -85,14 +85,13 @@ static TAILQ_HEAD(, hio) hio_free_list;
static pthread_mutex_t hio_free_list_lock;
static pthread_cond_t hio_free_list_cond;
/*
- * Disk thread (the one that do I/O requests) takes requests from this list.
+ * Disk thread (the one that does I/O requests) takes requests from this list.
*/
static TAILQ_HEAD(, hio) hio_disk_list;
static pthread_mutex_t hio_disk_list_lock;
static pthread_cond_t hio_disk_list_cond;
/*
- * There is one recv list for every component, although local components don't
- * use recv lists as local requests are done synchronously.
+ * Thread that sends requests back to primary takes requests from this list.
*/
static TAILQ_HEAD(, hio) hio_send_list;
static pthread_mutex_t hio_send_list_lock;
@@ -115,7 +114,7 @@ static void *send_thread(void *arg);
TAILQ_INSERT_TAIL(&hio_##name##_list, (hio), hio_next); \
mtx_unlock(&hio_##name##_list_lock); \
if (_wakeup) \
- cv_signal(&hio_##name##_list_cond); \
+ cv_broadcast(&hio_##name##_list_cond); \
} while (0)
#define QUEUE_TAKE(name, hio) do { \
mtx_lock(&hio_##name##_list_lock); \
More information about the svn-src-stable-9
mailing list