Re: git: cd25b0f740f8 - main - zfs: cherry-pick fix from openzfs

From: Mateusz Guzik <mjguzik_at_gmail.com>
Date: Thu, 10 Aug 2023 14:23:50 UTC
On 8/10/23, Martin Matuska <mm@freebsd.org> wrote:
> The branch main has been updated by mm:
>
> URL:
> https://cgit.FreeBSD.org/src/commit/?id=cd25b0f740f8c846562fd66e7380437eb898c875
>
> commit cd25b0f740f8c846562fd66e7380437eb898c875
> Author:     Martin Matuska <mm@FreeBSD.org>
> AuthorDate: 2023-08-10 07:55:42 +0000
> Commit:     Martin Matuska <mm@FreeBSD.org>
> CommitDate: 2023-08-10 07:56:53 +0000
>
>     zfs: cherry-pick fix from openzfs
>
>     Vendor PR:
>       #15080 ZIL: Fix config lock deadlock
>
>     Obtained from:  OpenZFS
>     OpenZFS commit: 2cb992a99ccadb78d97049b40bd442eb4fdc549d
>
>     Note: full vendor imports will continue when stable/14 has been
> branched

First a stylistic issue:
Can you please use upstream one-liner, maybe prefixed with zfs:. For
this commit it would be:
zfs: ZIL: Fix config lock deadlock.

then copy their commit message

For example see
https://cgit.freebsd.org/src/commit/?id=d09a955a605d03471c5ab7bd17b8a6186fdc148c

A not stylistic issue is the delay between upstream fixes and them
getting merged into FreeBSD. For example the commit at hand is over 2
weeks old and I presume this merge got only prompted by des@ running a
zfs-related deadlock.

We really should be getting timely updates without local people
running into them.

> ---
>  sys/contrib/openzfs/module/zfs/zil.c | 34
> +++++++++++++++++++++++++++-------
>  1 file changed, 27 insertions(+), 7 deletions(-)
>
> diff --git a/sys/contrib/openzfs/module/zfs/zil.c
> b/sys/contrib/openzfs/module/zfs/zil.c
> index 00d66a2481d7..af7137faaccf 100644
> --- a/sys/contrib/openzfs/module/zfs/zil.c
> +++ b/sys/contrib/openzfs/module/zfs/zil.c
> @@ -151,6 +151,7 @@ static kmem_cache_t *zil_lwb_cache;
>  static kmem_cache_t *zil_zcw_cache;
>
>  static void zil_lwb_commit(zilog_t *zilog, lwb_t *lwb, itx_t *itx);
> +static void zil_lwb_write_issue(zilog_t *zilog, lwb_t *lwb);
>  static itx_t *zil_itx_clone(itx_t *oitx);
>
>  static int
> @@ -1768,7 +1769,7 @@ static uint_t zil_maxblocksize =
> SPA_OLD_MAXBLOCKSIZE;
>   * Has to be called under zl_issuer_lock to chain more lwbs.
>   */
>  static lwb_t *
> -zil_lwb_write_close(zilog_t *zilog, lwb_t *lwb)
> +zil_lwb_write_close(zilog_t *zilog, lwb_t *lwb, list_t *ilwbs)
>  {
>  	lwb_t *nlwb = NULL;
>  	zil_chain_t *zilc;
> @@ -1870,6 +1871,27 @@ zil_lwb_write_close(zilog_t *zilog, lwb_t *lwb)
>
>  	dmu_tx_commit(tx);
>
> +	/*
> +	 * We need to acquire the config lock for the lwb to issue it later.
> +	 * However, if we already have a queue of closed parent lwbs already
> +	 * holding the config lock (but not yet issued), we can't block here
> +	 * waiting on the lock or we will deadlock.  In that case we must
> +	 * first issue to parent IOs before waiting on the lock.
> +	 */
> +	if (ilwbs && !list_is_empty(ilwbs)) {
> +		if (!spa_config_tryenter(spa, SCL_STATE, lwb, RW_READER)) {
> +			lwb_t *tlwb;
> +			while ((tlwb = list_remove_head(ilwbs)) != NULL)
> +				zil_lwb_write_issue(zilog, tlwb);
> +			spa_config_enter(spa, SCL_STATE, lwb, RW_READER);
> +		}
> +	} else {
> +		spa_config_enter(spa, SCL_STATE, lwb, RW_READER);
> +	}
> +
> +	if (ilwbs)
> +		list_insert_tail(ilwbs, lwb);
> +
>  	/*
>  	 * If there was an allocation failure then nlwb will be null which
>  	 * forces a txg_wait_synced().
> @@ -1933,7 +1955,7 @@ zil_lwb_write_issue(zilog_t *zilog, lwb_t *lwb)
>  		ZIL_STAT_INCR(zilog, zil_itx_metaslab_normal_alloc,
>  		    BP_GET_LSIZE(&lwb->lwb_blk));
>  	}
> -	spa_config_enter(zilog->zl_spa, SCL_STATE, lwb, RW_READER);
> +	ASSERT(spa_config_held(zilog->zl_spa, SCL_STATE, RW_READER));
>  	zil_lwb_add_block(lwb, &lwb->lwb_blk);
>  	lwb->lwb_issued_timestamp = gethrtime();
>  	zio_nowait(lwb->lwb_root_zio);
> @@ -2037,8 +2059,7 @@ cont:
>  	    lwb_sp < zil_max_waste_space(zilog) &&
>  	    (dlen % max_log_data == 0 ||
>  	    lwb_sp < reclen + dlen % max_log_data))) {
> -		list_insert_tail(ilwbs, lwb);
> -		lwb = zil_lwb_write_close(zilog, lwb);
> +		lwb = zil_lwb_write_close(zilog, lwb, ilwbs);
>  		if (lwb == NULL)
>  			return (NULL);
>  		zil_lwb_write_open(zilog, lwb);
> @@ -2937,8 +2958,7 @@ zil_process_commit_list(zilog_t *zilog,
> zil_commit_waiter_t *zcw, list_t *ilwbs)
>  			    zfs_commit_timeout_pct / 100;
>  			if (sleep < zil_min_commit_timeout ||
>  			    lwb->lwb_sz - lwb->lwb_nused < lwb->lwb_sz / 8) {
> -				list_insert_tail(ilwbs, lwb);
> -				lwb = zil_lwb_write_close(zilog, lwb);
> +				lwb = zil_lwb_write_close(zilog, lwb, ilwbs);
>  				zilog->zl_cur_used = 0;
>  				if (lwb == NULL) {
>  					while ((lwb = list_remove_head(ilwbs))
> @@ -3096,7 +3116,7 @@ zil_commit_waiter_timeout(zilog_t *zilog,
> zil_commit_waiter_t *zcw)
>  	 * since we've reached the commit waiter's timeout and it still
>  	 * hasn't been issued.
>  	 */
> -	lwb_t *nlwb = zil_lwb_write_close(zilog, lwb);
> +	lwb_t *nlwb = zil_lwb_write_close(zilog, lwb, NULL);
>
>  	ASSERT3S(lwb->lwb_state, !=, LWB_STATE_OPENED);
>
>


-- 
Mateusz Guzik <mjguzik gmail.com>