git: 693f88c9da8d - main - iconv_std: complete the //IGNORE support

From: Kyle Evans <kevans_at_FreeBSD.org>
Date: Thu, 11 Aug 2022 16:43:49 UTC
The branch main has been updated by kevans:

URL: https://cgit.FreeBSD.org/src/commit/?id=693f88c9da8dccf173b40fd57d1d15504a54e9b4

commit 693f88c9da8dccf173b40fd57d1d15504a54e9b4
Author:     Kyle Evans <kevans@FreeBSD.org>
AuthorDate: 2022-02-22 07:15:04 +0000
Commit:     Kyle Evans <kevans@FreeBSD.org>
CommitDate: 2022-08-11 16:42:20 +0000

    iconv_std: complete the //IGNORE support
    
    Previously, //IGNORE was only honored for csmapper conversion
    failures.  The input string may also contain invalid sequences, and
    those need to be ignored as well.
    
    A good example of //IGNORE application is sanitizing user- or remotely-
    specified strings that are expected to be UTF-8; perhaps as part of a
    pipeline that will feed the result into a system less tested against or
    tolerant of illegal UTF-8 sequences.
    
    Sponsored by:   Klara, Inc.
    Differential Revision:  https://reviews.freebsd.org/D34345
---
 lib/libiconv_modules/iconv_std/citrus_iconv_std.c | 18 ++++++++++++++++--
 1 file changed, 16 insertions(+), 2 deletions(-)

diff --git a/lib/libiconv_modules/iconv_std/citrus_iconv_std.c b/lib/libiconv_modules/iconv_std/citrus_iconv_std.c
index ec9f21de541e..73dc75abacbb 100644
--- a/lib/libiconv_modules/iconv_std/citrus_iconv_std.c
+++ b/lib/libiconv_modules/iconv_std/citrus_iconv_std.c
@@ -472,7 +472,7 @@ _citrus_iconv_std_iconv_convert(struct _citrus_iconv * __restrict cv,
 	_csid_t csid;
 	_index_t idx;
 	char *tmpin;
-	size_t inval, szrin, szrout;
+	size_t inval, in_mb_cur_min, szrin, szrout;
 	int ret, state = 0;
 
 	inval = 0;
@@ -504,6 +504,8 @@ _citrus_iconv_std_iconv_convert(struct _citrus_iconv * __restrict cv,
 		return (0);
 	}
 
+	in_mb_cur_min = _stdenc_get_mb_cur_min(is->is_src_encoding);
+
 	/* normal case */
 	for (;;) {
 		if (*inbytes == 0) {
@@ -522,8 +524,20 @@ _citrus_iconv_std_iconv_convert(struct _citrus_iconv * __restrict cv,
 		szrin = szrout = 0;
 		ret = mbtocsx(&sc->sc_src_encoding, &csid, &idx, &tmpin,
 		    *inbytes, &szrin, cv->cv_shared->ci_hooks);
-		if (ret)
+		if (ret != 0 && (ret != EILSEQ ||
+		    !cv->cv_shared->ci_discard_ilseq)) {
 			goto err;
+		} else if (ret == EILSEQ) {
+			/*
+			 * If //IGNORE was specified, we'll just keep crunching
+			 * through invalid characters.
+			 */
+			*in += in_mb_cur_min;
+			*inbytes -= in_mb_cur_min;
+			restore_encoding_state(&sc->sc_src_encoding);
+			restore_encoding_state(&sc->sc_dst_encoding);
+			continue;
+		}
 
 		if (szrin == (size_t)-2) {
 			/* incompleted character */