git: 5fa093b6efcb - main - mount_nfs: make temporary DNS failure non-fatal with background mode

From: Gleb Smirnoff <glebius_at_FreeBSD.org>
Date: Fri, 28 Mar 2025 21:37:51 UTC
The branch main has been updated by glebius:

URL: https://cgit.FreeBSD.org/src/commit/?id=5fa093b6efcb7eb16a17d9830dbd4404bff5a565

commit 5fa093b6efcb7eb16a17d9830dbd4404bff5a565
Author:     Gleb Smirnoff <glebius@FreeBSD.org>
AuthorDate: 2025-03-28 21:31:54 +0000
Commit:     Gleb Smirnoff <glebius@FreeBSD.org>
CommitDate: 2025-03-28 21:36:40 +0000

    mount_nfs: make temporary DNS failure non-fatal with background mode
    
    A typical problem with network mounts is remote equipment not being
    available when our host boots up after a power failure.  Even if you
    properly configure boot order of all local services and wait for link
    coming up on your NIC, you still may boot faster than some intermediate
    switch on the network or the DNS server itself.  Let's refer to this as a
    "server room boot race".  For NFS mounts with hostname in hosts(5) the
    race is addressed by a retry loop on NFS mount timeout.  However, a DNS
    resolution timeout is treated differently from an NFS mount timeout: we
    fail on the former and keep retrying on the latter.
    
    From the feedback received on current@, I see that the problem is so old
    that people have gotten used to it and see it as desired behavior rather
    than a problem.  To those who are affected by the problem, they suggest
    hosts(5) as a solution.  Note that using hosts(5) isn't scalable, and
    using bare IP addresses is neither scalable, nor compatible with
    Kerberized mounts.
    
    A trade-off solution would be to enable the retry cycle over DNS timeouts
    only when background mode is specified, which is typical in fstab(5) and
    very uncommon on the command line.  That would address the server room
    boot race problem without breaking POLA for command-line use.
    
    Reviewed by:            rmacklem
    Differential Revision:  https://reviews.freebsd.org/D49145
---
 sbin/mount_nfs/mount_nfs.c | 59 ++++++++++++++++++++++++++--------------------
 1 file changed, 34 insertions(+), 25 deletions(-)

diff --git a/sbin/mount_nfs/mount_nfs.c b/sbin/mount_nfs/mount_nfs.c
index 189bdd70b398..6ba51eeec588 100644
--- a/sbin/mount_nfs/mount_nfs.c
+++ b/sbin/mount_nfs/mount_nfs.c
@@ -587,6 +587,7 @@ getnfsargs(char **specp, char **hostpp, struct iovec **iov, int *iovlen)
 	char *hostp, *delimp, *errstr, *spec;
 	size_t len;
 	static char nam[MNAMELEN + 1], pname[MAXHOSTNAMELEN + 5];
+	bool resolved;
 
 	spec = *specp;
 	if (*spec == '[' && (delimp = strchr(spec + 1, ']')) != NULL &&
@@ -643,30 +644,7 @@ getnfsargs(char **specp, char **hostpp, struct iovec **iov, int *iovlen)
 	else if (nfsproto == IPPROTO_UDP)
 		hints.ai_socktype = SOCK_DGRAM;
 
-	if (getaddrinfo(hostp, portspec, &hints, &ai_nfs) != 0) {
-		hints.ai_flags = AI_CANONNAME;
-		if ((ecode = getaddrinfo(hostp, portspec, &hints, &ai_nfs))
-		    != 0) {
-			if (portspec == NULL)
-				errx(1, "%s: %s", hostp, gai_strerror(ecode));
-			else
-				errx(1, "%s:%s: %s", hostp, portspec,
-				    gai_strerror(ecode));
-			return (0);
-		}
-
-		/*
-		 * For a Kerberized nfs mount where the "principal"
-		 * argument has not been set, add it here.
-		 */
-		if (got_principal == 0 && secflavor != AUTH_SYS &&
-		    ai_nfs->ai_canonname != NULL) {
-			snprintf(pname, sizeof (pname), "nfs@%s",
-			    ai_nfs->ai_canonname);
-			build_iovec(iov, iovlen, "principal", pname,
-			    strlen(pname) + 1);
-		}
-	}
+	resolved = (getaddrinfo(hostp, portspec, &hints, &ai_nfs) == 0);
 
 	if ((opflags & (BGRNDNOW | ISBGRND)) == BGRNDNOW) {
 		warnx("Mount %s:%s, backgrounding",
@@ -678,6 +656,37 @@ getnfsargs(char **specp, char **hostpp, struct iovec **iov, int *iovlen)
 
 	ret = TRYRET_LOCALERR;
 	for (;;) {
+		if (!resolved) {
+			hints.ai_flags = AI_CANONNAME;
+			if ((ecode = getaddrinfo(hostp, portspec, &hints,
+			    &ai_nfs)) != 0) {
+				if (portspec == NULL)
+					warnx("%s: %s", hostp,
+					    gai_strerror(ecode));
+				else
+					warnx("%s:%s: %s", hostp, portspec,
+					    gai_strerror(ecode));
+				if (ecode == EAI_AGAIN &&
+				    (opflags & (BGRNDNOW | BGRND)))
+					goto retry;
+				else
+					exit(1);
+			}
+			resolved = true;
+			/*
+			 * For a Kerberized nfs mount where the
+			 * "principal" argument has not been set, add
+			 * it here.
+			 */
+			if (got_principal == 0 && secflavor != AUTH_SYS &&
+			    ai_nfs->ai_canonname != NULL) {
+				snprintf(pname, sizeof (pname), "nfs@%s",
+				    ai_nfs->ai_canonname);
+				build_iovec(iov, iovlen, "principal", pname,
+				    strlen(pname) + 1);
+			}
+		}
+
 		/*
 		 * Try each entry returned by getaddrinfo(). Note the
 		 * occurrence of remote errors by setting `remoteerr'.
@@ -705,7 +714,7 @@ getnfsargs(char **specp, char **hostpp, struct iovec **iov, int *iovlen)
 		/* Exit if all errors were local. */
 		if (!remoteerr)
 			exit(1);
-
+retry:
 		/*
 		 * If retrycnt == 0, we are to keep retrying forever.
 		 * Otherwise decrement it, and exit if it hits zero.