git: d803854bccb9 - main - libc/getaddrinfo(2): return EAI_AGAIN on nameserver timeout

From: Gleb Smirnoff <glebius_at_FreeBSD.org>
Date: Fri, 28 Mar 2025 21:37:50 UTC
The branch main has been updated by glebius:

URL: https://cgit.FreeBSD.org/src/commit/?id=d803854bccb9ea527c1769ac403e011ff0e121e5

commit d803854bccb9ea527c1769ac403e011ff0e121e5
Author:     Gleb Smirnoff <glebius@FreeBSD.org>
AuthorDate: 2025-03-28 21:35:35 +0000
Commit:     Gleb Smirnoff <glebius@FreeBSD.org>
CommitDate: 2025-03-28 21:36:40 +0000

    libc/getaddrinfo(2): return EAI_AGAIN on nameserver timeout
    
    A nameserver timeout is a soft failure, future attempts may succeed.
    Returning EAI_AGAIN is crucial for API users to tell a soft name
    resolution failure from negative resolution result.
    
    Before the change we would return EAI_ADDRFAMILY, which I believe, is a
    regression from 144361386696, and before that revision we used to return
    EAI_NONAME in most of the cases.
    
    Reviewed by:            kib
    Differential Revision:  https://reviews.freebsd.org/D49411
---
 lib/libc/net/getaddrinfo.c                   | 58 +++++++++++++++++++++-------
 lib/libc/tests/net/getaddrinfo/getaddrinfo.c | 53 ++++++++++++-------------
 2 files changed, 67 insertions(+), 44 deletions(-)

diff --git a/lib/libc/net/getaddrinfo.c b/lib/libc/net/getaddrinfo.c
index 2b9499d5099b..b8af23ebe8da 100644
--- a/lib/libc/net/getaddrinfo.c
+++ b/lib/libc/net/getaddrinfo.c
@@ -2341,9 +2341,14 @@ _dns_getaddrinfo(void *rv, void *cb_data, va_list ap)
 	if (res_searchN(hostname, &q, res) < 0) {
 		free(buf);
 		free(buf2);
-		if (res->res_h_errno == NO_DATA)
+		switch (res->res_h_errno) {
+		case NO_DATA:
 			return (NS_ADDRFAMILY);
-		return (NS_NOTFOUND);
+		case TRY_AGAIN:
+			return (NS_TRYAGAIN);
+		default:
+			return (NS_NOTFOUND);
+		}
 	}
 	/* prefer IPv6 */
 	if (q.next) {
@@ -2705,9 +2710,18 @@ res_queryN(const char *name, struct res_target *target, res_state res)
 	int n;
 	u_int oflags;
 	struct res_target *t;
-	int rcode;
+	u_int rcode;
 	int ancount;
 
+	/*
+	 * Extend rcode values in the scope of this function.  The DNS header
+	 * rcode we use in this function (hp->rcode) is limited by 4 bits, so
+	 * anything starting from 16 is safe wrt aliasing.  However, nameser.h
+	 * already has extended enum __ns_rcode, so for future safety let's use
+	 * even larger values.
+	 */
+#define	RCODE_UNREACH	32
+#define	RCODE_TIMEDOUT	33
 	rcode = NOERROR;
 	ancount = 0;
 
@@ -2768,7 +2782,29 @@ again:
 					printf(";; res_nquery: retry without EDNS0\n");
 				goto again;
 			}
-			rcode = hp->rcode;	/* record most recent error */
+                        /*
+			 * Historically if a DNS server replied with ICMP port
+			 * unreach res_nsend() would signal that with
+			 * ECONNREFUSED and the upper layers would convert that
+			 * into TRY_AGAIN.  See 3a0b3b673936b and deeper.
+			 * Also, res_nsend() may set errno to ECONNREFUSED due
+			 * to internal failures.  This may not be intentional,
+			 * but we also treat that as soft failures.
+			 *
+			 * A more practical case is when a DNS server(s) were
+			 * queried and didn't respond anything, which usually
+			 * indicates a soft network failure.
+			 */
+			switch (errno) {
+			case ECONNREFUSED:
+				rcode = RCODE_UNREACH;
+				break;
+			case ETIMEDOUT:
+				rcode = RCODE_TIMEDOUT;
+				break;
+			default:
+                                rcode = hp->rcode;
+			}
 #ifdef DEBUG
 			if (res->options & RES_DEBUG)
 				printf(";; res_query: send error\n");
@@ -2800,6 +2836,8 @@ again:
 		case NXDOMAIN:
 			RES_SET_H_ERRNO(res, HOST_NOT_FOUND);
 			break;
+		case RCODE_UNREACH:
+		case RCODE_TIMEDOUT:
 		case SERVFAIL:
 			RES_SET_H_ERRNO(res, TRY_AGAIN);
 			break;
@@ -2862,10 +2900,6 @@ res_searchN(const char *name, struct res_target *target, res_state res)
 		ret = res_querydomainN(name, NULL, target, res);
 		if (ret > 0 || trailing_dot)
 			return (ret);
-		if (errno == ECONNREFUSED) {
-			RES_SET_H_ERRNO(res, TRY_AGAIN);
-			return (-1);
-		}
 		switch (res->res_h_errno) {
 		case NO_DATA:
 		case HOST_NOT_FOUND:
@@ -2906,7 +2940,6 @@ res_searchN(const char *name, struct res_target *target, res_state res)
 			ret = res_querydomainN(name, *domain, target, res);
 			if (ret > 0)
 				return (ret);
-
 			/*
 			 * If no server present, give up.
 			 * If name isn't found in this domain,
@@ -2920,11 +2953,6 @@ res_searchN(const char *name, struct res_target *target, res_state res)
 			 * but try the input name below in case it's
 			 * fully-qualified.
 			 */
-			if (errno == ECONNREFUSED) {
-				RES_SET_H_ERRNO(res, TRY_AGAIN);
-				return (-1);
-			}
-
 			switch (res->res_h_errno) {
 			case NO_DATA:
 				got_nodata++;
@@ -2933,8 +2961,8 @@ res_searchN(const char *name, struct res_target *target, res_state res)
 				/* keep trying */
 				break;
 			case TRY_AGAIN:
-				got_servfail++;
 				if (hp->rcode == SERVFAIL) {
+					got_servfail++;
 					/* try next search element, if any */
 					break;
 				}
diff --git a/lib/libc/tests/net/getaddrinfo/getaddrinfo.c b/lib/libc/tests/net/getaddrinfo/getaddrinfo.c
index 9d8575232d2a..022405f464ce 100644
--- a/lib/libc/tests/net/getaddrinfo/getaddrinfo.c
+++ b/lib/libc/tests/net/getaddrinfo/getaddrinfo.c
@@ -122,19 +122,14 @@ ATF_TC_BODY(timeout, tc)
 
 	resconf = badresolvconf;
 	rv = getaddrinfo(goodname, NULL, &hints, &res);
-	/*
-	 * XXXGL: EAI_ADDRFAMILY is most likely a regression from 144361386696.
-	 * Error code on timeout used to be EAI_NONAME for many years and IMHO
-	 * this is not correct either.  Should be EAI_AGAIN.
-	 */
-	ATF_REQUIRE_MSG(rv == EAI_ADDRFAMILY,
-	    "Expected %d (EAI_ADDRFAMILY), got %d (%s)",
-	    EAI_ADDRFAMILY, rv, gai_strerror(rv));
+	ATF_REQUIRE_MSG(rv == EAI_AGAIN,
+	    "Expected %d (EAI_AGAIN), got %d (%s)",
+	    EAI_AGAIN, rv, gai_strerror(rv));
 
 	rv = getaddrinfo(goodname_dot, NULL, &hints, &res);
-	ATF_REQUIRE_MSG(rv == EAI_ADDRFAMILY,
-	    "Expected %d (EAI_ADDRFAMILY), got %d (%s)",
-	    EAI_ADDRFAMILY, rv, gai_strerror(rv));
+	ATF_REQUIRE_MSG(rv == EAI_AGAIN,
+	    "Expected %d (EAI_AGAIN), got %d (%s)",
+	    EAI_AGAIN, rv, gai_strerror(rv));
 }
 
 ATF_TC_WITHOUT_HEAD(timeout_specific);
@@ -150,14 +145,14 @@ ATF_TC_BODY(timeout_specific, tc)
 
 	resconf = badresolvconf;
 	rv = getaddrinfo(goodname, "666", &hints, &res);
-	ATF_REQUIRE_MSG(rv == EAI_ADDRFAMILY,
-	    "Expected %d (EAI_ADDRFAMILY), got %d (%s)",
-	    EAI_ADDRFAMILY, rv, gai_strerror(rv));
+	ATF_REQUIRE_MSG(rv == EAI_AGAIN,
+	    "Expected %d (EAI_AGAIN), got %d (%s)",
+	    EAI_AGAIN, rv, gai_strerror(rv));
 
 	rv = getaddrinfo(goodname_dot, "666", &hints, &res);
-	ATF_REQUIRE_MSG(rv == EAI_ADDRFAMILY,
-	    "Expected %d (EAI_ADDRFAMILY), got %d (%s)",
-	    EAI_ADDRFAMILY, rv, gai_strerror(rv));
+	ATF_REQUIRE_MSG(rv == EAI_AGAIN,
+	    "Expected %d (EAI_AGAIN), got %d (%s)",
+	    EAI_AGAIN, rv, gai_strerror(rv));
 }
 
 ATF_TC_WITHOUT_HEAD(timeout2);
@@ -172,14 +167,14 @@ ATF_TC_BODY(timeout2, tc)
 
 	resconf = badresolvconf2;
 	rv = getaddrinfo(goodname, NULL, &hints, &res);
-	ATF_REQUIRE_MSG(rv == EAI_ADDRFAMILY,
-	    "Expected %d (EAI_ADDRFAMILY), got %d (%s)",
-	    EAI_ADDRFAMILY, rv, gai_strerror(rv));
+	ATF_REQUIRE_MSG(rv == EAI_AGAIN,
+	    "Expected %d (EAI_AGAIN), got %d (%s)",
+	    EAI_AGAIN, rv, gai_strerror(rv));
 
 	rv = getaddrinfo(goodname_dot, NULL, &hints, &res);
-	ATF_REQUIRE_MSG(rv == EAI_ADDRFAMILY,
-	    "Expected %d (EAI_ADDRFAMILY), got %d (%s)",
-	    EAI_ADDRFAMILY, rv, gai_strerror(rv));
+	ATF_REQUIRE_MSG(rv == EAI_AGAIN,
+	    "Expected %d (EAI_AGAIN), got %d (%s)",
+	    EAI_AGAIN, rv, gai_strerror(rv));
 }
 
 /*
@@ -197,14 +192,14 @@ ATF_TC_BODY(netdown, tc)
 
 	send_error = ENETDOWN;
 	rv = getaddrinfo(goodname, NULL, &hints, &res);
-	ATF_REQUIRE_MSG(rv == EAI_NONAME,
-	    "Expected %d (EAI_NONAME), got %d (%s)",
-	    EAI_NONAME, rv, gai_strerror(rv));
+	ATF_REQUIRE_MSG(rv == EAI_AGAIN,
+	    "Expected %d (EAI_AGAIN), got %d (%s)",
+	    EAI_AGAIN, rv, gai_strerror(rv));
 
 	rv = getaddrinfo(goodname_dot, NULL, &hints, &res);
-	ATF_REQUIRE_MSG(rv == EAI_ADDRFAMILY,
-	    "Expected %d (EAI_ADDRFAMILY), got %d (%s)",
-	    EAI_ADDRFAMILY, rv, gai_strerror(rv));
+	ATF_REQUIRE_MSG(rv == EAI_AGAIN,
+	    "Expected %d (EAI_AGAIN), got %d (%s)",
+	    EAI_AGAIN, rv, gai_strerror(rv));
 }
 
 /*