git: 9e589b093857 - main - tty: fix improper backspace behaviour for UTF8 characters when in canonical mode

From: Christos Margiolis <christos_at_FreeBSD.org>
Date: Sat, 07 Oct 2023 18:00:51 UTC
The branch main has been updated by christos:

URL: https://cgit.FreeBSD.org/src/commit/?id=9e589b0938579f3f4d89fa5c051f845bf754184d

commit 9e589b0938579f3f4d89fa5c051f845bf754184d
Author:     Bojan Novković <bojan.novkovic@fer.hr>
AuthorDate: 2023-10-07 18:00:11 +0000
Commit:     Christos Margiolis <christos@FreeBSD.org>
CommitDate: 2023-10-07 18:00:11 +0000

    tty: fix improper backspace behaviour for UTF8 characters when in canonical mode
    
    This patch adds additional logic in ttydisc_rubchar() to properly handle
    backspace behaviour for UTF-8 characters.
    
    Currently, typing in a backspace after a UTF8 character will delete only
    one byte from the byte sequence, leaving garbled output in the tty's
    output queue. With this change all of the character's bytes are deleted.
    This change is only active when the IUTF8 flag is set (see
    19054eb6053189144aa962b2ecc1bf5087758a3e "(s)tty: add support for IUTF8
    input flag")
    
    The code uses the teken_wcwidth() function to properly handle character
    column widths for different code points, and adds the
    teken_utf8_bytes_to_codepoint() function that converts a UTF-8 byte
    sequence to a codepoint, as specified in RFC3629.
    
    Reported by:    christos
    Reviewed by:    christos, imp
    MFC after:      2 weeks
    Differential Revision:  https://reviews.freebsd.org/D42067
---
 sys/kern/tty_ttydisc.c    | 74 +++++++++++++++++++++++++++++++++++++++++++++++
 sys/teken/teken_wcwidth.h | 30 +++++++++++++++++++
 2 files changed, 104 insertions(+)

diff --git a/sys/kern/tty_ttydisc.c b/sys/kern/tty_ttydisc.c
index 665275ee93e7..eae7162e31c0 100644
--- a/sys/kern/tty_ttydisc.c
+++ b/sys/kern/tty_ttydisc.c
@@ -43,6 +43,9 @@
 #include <sys/uio.h>
 #include <sys/vnode.h>
 
+#include <teken/teken.h>
+#include <teken/teken_wcwidth.h>
+
 /*
  * Standard TTYDISC `termios' line discipline.
  */
@@ -78,8 +81,13 @@ SYSCTL_ULONG(_kern, OID_AUTO, tty_nout, CTLFLAG_RD,
 /* Character is alphanumeric. */
 #define CTL_ALNUM(c)	(((c) >= '0' && (c) <= '9') || \
     ((c) >= 'a' && (c) <= 'z') || ((c) >= 'A' && (c) <= 'Z'))
+/* Character is UTF8-encoded. */
+#define CTL_UTF8(c) (!!((c) & 0x80))
+/* Character is a UTF8 continuation byte. */
+#define CTL_UTF8_CONT(c) (((c) & 0xc0) == 0x80)
 
 #define	TTY_STACKBUF	256
+#define UTF8_STACKBUF 4
 
 void
 ttydisc_open(struct tty *tp)
@@ -800,6 +808,72 @@ ttydisc_rubchar(struct tty *tp)
 				ttyoutq_write_nofrag(&tp->t_outq,
 				    "\b\b\b\b\b\b\b\b", tablen);
 				return (0);
+			} else if ((tp->t_termios.c_iflag & IUTF8) != 0 &&
+			    CTL_UTF8(c)) {
+				uint8_t bytes[UTF8_STACKBUF] = { 0 };
+				int curidx = UTF8_STACKBUF - 1, cwidth = 1,
+				    nb = 0;
+				teken_char_t codepoint;
+
+				/* Save current byte. */
+				bytes[curidx] = c;
+				curidx--;
+				nb++;
+				/* Loop back through inq until we hit the
+				 * leading byte. */
+				while (CTL_UTF8_CONT(c) && nb < UTF8_STACKBUF) {
+					ttyinq_peekchar(&tp->t_inq, &c, &quote);
+					ttyinq_unputchar(&tp->t_inq);
+					bytes[curidx] = c;
+					curidx--;
+					nb++;
+				}
+				/*
+				 * Shift array so that the leading
+				 * byte ends up at idx 0.
+				 */
+				if (nb < UTF8_STACKBUF)
+					memmove(&bytes[0], &bytes[curidx + 1],
+					    nb * sizeof(uint8_t));
+				/* Check for malformed UTF8 characters. */
+				if (nb == UTF8_STACKBUF &&
+				    CTL_UTF8_CONT(bytes[0])) {
+					/*
+					 * Place all bytes back into the inq and
+					 * delete the last byte only.
+					 */
+					ttyinq_write(&tp->t_inq, bytes,
+					    UTF8_STACKBUF, 0);
+				} else {
+					/* Find codepoint and width. */
+					codepoint =
+					    teken_utf8_bytes_to_codepoint(bytes,
+						nb);
+					if (codepoint !=
+					    TEKEN_UTF8_INVALID_CODEPOINT) {
+						cwidth = teken_wcwidth(
+						    codepoint);
+					} else {
+						/*
+						 * Place all bytes back into the
+						 * inq and fall back to
+						 * default behaviour.
+						 */
+						ttyinq_write(&tp->t_inq, bytes,
+						    nb, 0);
+					}
+				}
+				tp->t_column -= cwidth;
+				/*
+				 * Delete character by punching
+				 * 'cwidth' spaces over it.
+				 */
+				if (cwidth == 1)
+					ttyoutq_write_nofrag(&tp->t_outq,
+					    "\b \b", 3);
+				else if (cwidth == 2)
+					ttyoutq_write_nofrag(&tp->t_outq,
+					    "\b\b  \b\b", 6);
 			} else {
 				/*
 				 * Remove a regular character by
diff --git a/sys/teken/teken_wcwidth.h b/sys/teken/teken_wcwidth.h
index f57a185c2433..f5a23dbc9679 100644
--- a/sys/teken/teken_wcwidth.h
+++ b/sys/teken/teken_wcwidth.h
@@ -8,6 +8,8 @@
  * Latest version: http://www.cl.cam.ac.uk/~mgk25/ucs/wcwidth.c
  */
 
+#define TEKEN_UTF8_INVALID_CODEPOINT -1
+
 struct interval {
   teken_char_t first;
   teken_char_t last;
@@ -116,3 +118,31 @@ static int teken_wcwidth(teken_char_t ucs)
       (ucs >= 0x20000 && ucs <= 0x2fffd) ||
       (ucs >= 0x30000 && ucs <= 0x3fffd)));
 }
+
+/*
+ * Converts an UTF-8 byte sequence to a codepoint as specified in
+ * https://datatracker.ietf.org/doc/html/rfc3629#section-3 . The function
+ * expects the 'bytes' array to start with the leading character.
+ */
+static teken_char_t
+teken_utf8_bytes_to_codepoint(uint8_t bytes[4], int nbytes)
+{
+
+  /* Check for malformed characters. */
+  if (bitcount(bytes[0] & 0xf0) != nbytes)
+    return (TEKEN_UTF8_INVALID_CODEPOINT);
+
+  switch (nbytes) {
+  case 1:
+    return (bytes[0] & 0x7f);
+  case 2:
+    return (bytes[0] & 0xf) << 6 | (bytes[1] & 0x3f);
+  case 3:
+    return (bytes[0] & 0xf) << 12 | (bytes[1] & 0x3f) << 6 | (bytes[2] & 0x3f);
+  case 4:
+    return (bytes[0] & 0x7) << 18 | (bytes[1] & 0x3f) << 12 |
+	(bytes[2] & 0x3f) << 6 | (bytes[3] & 0x3f);
+  default:
+    return (TEKEN_UTF8_INVALID_CODEPOINT);
+  }
+}