LCOV - code coverage report
Current view: top level - tests/functional - test_text_rendering_safety.c (source / functions) Coverage Total Hit
Test: coverage.info Lines: 95.7 % 234 224
Test Date: 2026-04-20 19:54:22 Functions: 100.0 % 20 20

            Line data    Source code
       1              : /**
       2              :  * @file test_text_rendering_safety.c
       3              :  * @brief TEST-78 / US-27 — Unicode + ANSI-escape safety for screen rendering.
       4              :  *
       5              :  * The subject is src/tui/screen.c::screen_put_str[_n], which
       6              :  *   1. decodes UTF-8 (utf8_decode),
       7              :  *   2. rewrites hazardous control codepoints to U+00B7 MIDDLE DOT
       8              :  *      (SEC-01 sanitisation block, screen.c L111-119), and
       9              :  *   3. stores the resulting codepoint + display-width in the Screen
      10              :  *      back buffer via terminal_wcwidth().
      11              :  *
      12              :  * All three steps are pure: given a UTF-8 string, the only observable
      13              :  * side-effect is the mutation of Screen::back[].  No socket, MTProto,
      14              :  * mock server, or TL parsing is required — the functional test drives
      15              :  * the production domain end-to-end by writing a message body verbatim
      16              :  * into a Screen and inspecting the resulting cell grid.
      17              :  *
      18              :  * Scenarios covered (12 tests):
      19              :  *   1.  ANSI CSI erase-display  (ESC [ 2 J)
      20              :  *   2.  OSC title-set           (ESC ] 0 ; evil BEL)
      21              :  *   3.  BEL + 8-bit CSI + DEL   (0x07, 0x9B, 0x7F)
      22              :  *   4.  \t and \n are preserved (only allowed low-controls)
      23              :  *   5.  Emoji — smiley          (U+1F600, width 2)
      24              :  *   6.  CJK                     ("你好", 4 cols)
      25              :  *   7.  RTL — Hebrew "שלום"     (stored LTR in logical order)
      26              :  *   8.  Zero-width joiner       (👨‍👩‍👧: ZWJ family cluster)
      27              :  *   9.  Combining mark          ("e" + U+0301 → width 1)
      28              :  *  10.  Malformed UTF-8         (three invalid bytes → three U+FFFD)
      29              :  *  11.  Overlong UTF-8          (5-byte F8-lead → five U+FFFD)
      30              :  *  12.  UTF-8 BOM inside body   (U+FEFF is zero-width, not garbage)
      31              :  *
      32              :  * Assertion helpers:
      33              :  *   assert_no_raw_escape_bytes — scans each cell for codepoints 0x1B,
      34              :  *      0x07, 0x9B, 0x7F; none must survive sanitisation.
      35              :  *   find_cp / count_cp — scan row for codepoint matches.
      36              :  *
      37              :  * TEST-55 remains the dialog-title width-only variant; this suite adds
      38              :  * the missing message-body + injection + malformed-UTF-8 coverage that
      39              :  * TEST-55 did not reach.
      40              :  */
      41              : 
      42              : #include "test_helpers.h"
      43              : 
      44              : #include "tui/screen.h"
      45              : #include "platform/terminal.h"
      46              : 
      47              : #include <locale.h>
      48              : #include <stdint.h>
      49              : #include <stdio.h>
      50              : #include <stdlib.h>
      51              : #include <string.h>
      52              : 
      53              : /* ---- shared helpers ---- */
      54              : 
      55              : /**
      56              :  * Fail the test if any cell in @p s carries a raw control-character
      57              :  * codepoint that SEC-01 was supposed to neutralise.
      58              :  *
      59              :  * The four bytes targeted are exactly the ones a malicious Telegram
      60              :  * message could use to break out of our display area:
      61              :  *   0x1B  ESC — lead byte of every ANSI CSI / OSC / DEC sequence.
      62              :  *   0x07  BEL — audible bell, and terminator of OSC title sequences.
      63              :  *   0x9B  CSI — 8-bit introducer equivalent to "ESC [".
      64              :  *   0x7F  DEL — some terminals interpret as erase-under-cursor.
      65              :  */
      66           16 : static void assert_no_raw_escape_bytes(const Screen *s, const char *where) {
      67           16 :     int total = s->rows * s->cols;
      68         1040 :     for (int i = 0; i < total; i++) {
      69         1024 :         uint32_t cp = s->back[i].cp;
      70         1024 :         if (cp == 0x1B || cp == 0x07 || cp == 0x9B || cp == 0x7F) {
      71            0 :             printf("  [FAIL] %s: raw control cp=0x%02X at cell %d\n",
      72              :                    where, (unsigned)cp, i);
      73            0 :             g_tests_failed++;
      74            0 :             return;
      75              :         }
      76              :     }
      77              : }
      78              : 
      79              : /**
      80              :  * After screen_flip, scan the emitted byte stream for 0x07 / 0x9B / 0x7F
      81              :  * — these would only reach stdout if the sanitiser had failed, because
      82              :  * the flipper never emits those bytes itself.
      83              :  *
      84              :  * The 0x1B byte is deliberately NOT flagged here: screen_flip always
      85              :  * writes CUP ("\033[…H"), SGR ("\033[…m") and DECTCEM framing before
      86              :  * UTF-8 cell contents, so ESC is expected and excluded from the check.
      87              :  *
      88              :  * Caveat: the scan is byte-wise, and 0x9B is also a legal UTF-8
      89              :  * continuation byte (e.g. U+05DB = 0xD7 0x9B); fixtures must avoid it.
      90              :  */
      91           24 : static void assert_no_user_escape_in_stream(const char *buf, size_t len,
      92              :                                             const char *where) {
      93         1864 :     for (size_t i = 0; i < len; i++) {
      94         1840 :         unsigned char b = (unsigned char)buf[i];
      95         1840 :         if (b == 0x07 || b == 0x9B || b == 0x7F) {
      96            0 :             printf("  [FAIL] %s: raw byte 0x%02X at stream offset %zu\n",
      97              :                    where, (unsigned)b, i);
      98            0 :             g_tests_failed++;
      99            0 :             return;
     100              :         }
     101              :     }
     102              : }
     103              : 
     104              : /** Return the column of the first cell in @p row whose cp == @p needle,
     105              :  *  or -1 if no such cell exists. */
     106            6 : static int find_cp(const Screen *s, int row, uint32_t needle) {
     107          390 :     for (int c = 0; c < s->cols; c++) {
     108          384 :         if (s->back[(size_t)row * s->cols + c].cp == needle) return c;
     109              :     }
     110            6 :     return -1;
     111              : }
     112              : 
     113              : /** Count occurrences of codepoint @p needle on @p row. */
     114            2 : static int count_cp(const Screen *s, int row, uint32_t needle) {
     115            2 :     int n = 0;
     116          130 :     for (int c = 0; c < s->cols; c++) {
     117          128 :         if (s->back[(size_t)row * s->cols + c].cp == needle) n++;
     118              :     }
     119            2 :     return n;
     120              : }
     121              : 
     122              : /**
     123              :  * Per-test scratch buffer backed by open_memstream() so each test can
     124              :  * examine the exact bytes screen_flip emits (and confirm no raw
     125              :  * escape payload leaks into the terminal stream).
     126              :  */
     127              : typedef struct {
     128              :     char  *buf;
     129              :     size_t len;
     130              :     FILE  *out;
     131              : } RenderSink;
     132              : 
     133              : /** Small screen used by every test: single row is easiest to reason about
     134              :  *  and still exercises utf8_decode + sanitiser + wcwidth. */
     135           24 : static void screen_setup(Screen *s, RenderSink *rs) {
     136              :     /* terminal_wcwidth() delegates to POSIX wcwidth(3), which on glibc
     137              :      * returns -1 for any non-ASCII codepoint under the default "C" locale.
     138              :      * Enable the environment locale (typically *.UTF-8) so wide/narrow
     139              :      * classification works for emoji, CJK, combining marks, etc.  The
     140              :      * TEST-55 sibling test in tests/unit/test_platform.c does the same. */
     141           24 :     setlocale(LC_ALL, "");
     142              : 
     143              :     /* 1 row × 64 cols is enough for the longest fixture (the OSC title test, 10 cols). */
     144           24 :     int rc = screen_init(s, 1, 64);
     145           24 :     if (rc != 0) {
     146            0 :         printf("  [FATAL] screen_init failed\n");
     147            0 :         abort();
     148              :     }
     149              :     /* Route the ANSI byte stream into a memstream so each test can
     150              :      * both (a) see exactly what screen_flip emits and (b) exercise
     151              :      * the utf8_encode / sgr_encode / cup_encode / screen_flip paths
     152              :      * — none of which the back-buffer inspection alone would cover. */
     153           24 :     rs->buf = NULL; rs->len = 0;
     154           24 :     rs->out = open_memstream(&rs->buf, &rs->len);
     155           24 :     if (!rs->out) {
     156            0 :         printf("  [FATAL] open_memstream failed\n");
     157            0 :         abort();
     158              :     }
     159           24 :     s->out = rs->out;
     160           24 : }
     161              : 
     162           24 : static void screen_teardown(Screen *s, RenderSink *rs) {
     163           24 :     if (rs->out) { fclose(rs->out); rs->out = NULL; }
     164           24 :     free(rs->buf); rs->buf = NULL; rs->len = 0;
     165           24 :     screen_free(s);
     166           24 : }
     167              : 
     168              : /** Flush the back buffer through screen_flip, then fflush the memstream
     169              :  *  so it holds the bytes the flipper wrote.  Must be called before
     170              :  *  inspecting rs->buf / rs->len. */
     171           24 : static void screen_drain(Screen *s, RenderSink *rs) {
     172           24 :     (void)screen_flip(s);
     173           24 :     fflush(rs->out);
     174           24 : }
     175              : 
     176              : /* ================================================================ */
     177              : /* Tests                                                            */
     178              : /* ================================================================ */
     179              : 
     180              : /* 1. ANSI CSI erase-display inside a message body must become MIDDLE DOTs. */
     181            2 : static void test_plain_history_strips_ansi_csi(void) {
     182            2 :     Screen s; RenderSink rs; screen_setup(&s, &rs);
     183              : 
     184              :     /* "A" + ESC + "[2J" + "B" — ESC is U+001B which the sanitiser rewrites
     185              :      * to U+00B7 MIDDLE DOT (width 1 in a UTF-8 locale); the other bytes
     186              :      * are printable ASCII that pass through untouched. */
     187            2 :     const char msg[] = "A\x1b[2JB";
     188            2 :     int cols = screen_put_str(&s, 0, 0, msg, 0);
     189            2 :     ASSERT(cols == 6, "6 cols: A + MIDDLE_DOT + [ + 2 + J + B");
     190              : 
     191            2 :     ASSERT(s.back[0].cp == (uint32_t)'A', "cell 0 is 'A'");
     192            2 :     ASSERT(s.back[1].cp == 0x00B7,        "cell 1 is U+00B7 (ESC replaced)");
     193            2 :     ASSERT(s.back[2].cp == (uint32_t)'[', "cell 2 is '['");
     194            2 :     ASSERT(s.back[3].cp == (uint32_t)'2', "cell 3 is '2'");
     195            2 :     ASSERT(s.back[4].cp == (uint32_t)'J', "cell 4 is 'J'");
     196            2 :     ASSERT(s.back[5].cp == (uint32_t)'B', "cell 5 is 'B'");
     197            2 :     assert_no_raw_escape_bytes(&s, "ansi_csi");
     198              : 
     199            2 :     screen_drain(&s, &rs);
     200            2 :     assert_no_user_escape_in_stream(rs.buf, rs.len, __func__);
     201            2 :     screen_teardown(&s, &rs);
     202              : }
     203              : 
     204              : /* 2. OSC title-set sequence: ESC ]0;evil BEL */
     205            2 : static void test_plain_history_strips_osc_title(void) {
     206            2 :     Screen s; RenderSink rs; screen_setup(&s, &rs);
     207              : 
     208            2 :     const char msg[] = "\x1b]0;evil\x07" "X";
     209            2 :     (void)screen_put_str(&s, 0, 0, msg, 0);
     210              : 
     211              :     /* ESC becomes MIDDLE DOT; the literal "]0;evil" passes through as
     212              :      * printable ASCII (not hazardous on its own); BEL becomes MIDDLE DOT;
     213              :      * 'X' remains. */
     214            2 :     ASSERT(s.back[0].cp == 0x00B7, "ESC → U+00B7");
     215            2 :     ASSERT(s.back[1].cp == (uint32_t)']', "']'");
     216            2 :     ASSERT(s.back[2].cp == (uint32_t)'0', "'0'");
     217            2 :     ASSERT(s.back[3].cp == (uint32_t)';', "';'");
     218            2 :     ASSERT(s.back[4].cp == (uint32_t)'e', "'e'");
     219            2 :     ASSERT(s.back[5].cp == (uint32_t)'v', "'v'");
     220            2 :     ASSERT(s.back[6].cp == (uint32_t)'i', "'i'");
     221            2 :     ASSERT(s.back[7].cp == (uint32_t)'l', "'l'");
     222            2 :     ASSERT(s.back[8].cp == 0x00B7,        "BEL → U+00B7");
     223            2 :     ASSERT(s.back[9].cp == (uint32_t)'X', "trailing 'X' preserved");
     224            2 :     assert_no_raw_escape_bytes(&s, "osc_title");
     225              : 
     226            2 :     screen_drain(&s, &rs);
     227            2 :     assert_no_user_escape_in_stream(rs.buf, rs.len, __func__);
     228            2 :     screen_teardown(&s, &rs);
     229              : }
     230              : 
     231              : /* 3. BEL (0x07), 8-bit CSI (0xC2 0x9B in UTF-8), and DEL (0x7F). */
     232            2 : static void test_plain_history_strips_bel_and_c1(void) {
     233            2 :     Screen s; RenderSink rs; screen_setup(&s, &rs);
     234              : 
     235              :     /* U+009B encodes as the two UTF-8 bytes 0xC2 0x9B.  utf8_decode
     236              :      * should return cp=0x9B which the sanitiser must rewrite. */
     237            2 :     const char msg[] = "\x07" "a" "\xc2\x9b" "b" "\x7f" "c";
     238            2 :     (void)screen_put_str(&s, 0, 0, msg, 0);
     239              : 
     240            2 :     ASSERT(s.back[0].cp == 0x00B7,        "BEL → U+00B7");
     241            2 :     ASSERT(s.back[1].cp == (uint32_t)'a', "'a'");
     242            2 :     ASSERT(s.back[2].cp == 0x00B7,        "U+009B → U+00B7");
     243            2 :     ASSERT(s.back[3].cp == (uint32_t)'b', "'b'");
     244            2 :     ASSERT(s.back[4].cp == 0x00B7,        "DEL → U+00B7");
     245            2 :     ASSERT(s.back[5].cp == (uint32_t)'c', "'c'");
     246            2 :     assert_no_raw_escape_bytes(&s, "bel_c1_del");
     247              : 
     248            2 :     screen_drain(&s, &rs);
     249            2 :     assert_no_user_escape_in_stream(rs.buf, rs.len, __func__);
     250            2 :     screen_teardown(&s, &rs);
     251              : }
     252              : 
     253              : /* 4. \t (0x09) and \n (0x0A) are the two low-controls that SEC-01
     254              :  *    explicitly allows through — they reach terminal_wcwidth() unchanged.
     255              :  *    wcwidth() returns <=0 for both so screen_put_str skips them silently
     256              :  *    (no cell mutated, no cols consumed).  The important property is that
     257              :  *    the sanitiser did NOT replace them with U+00B7. */
     258            2 : static void test_plain_history_preserves_newline_and_tab(void) {
     259            2 :     Screen s; RenderSink rs; screen_setup(&s, &rs);
     260              : 
     261              :     /* Interleave the two controls with printable ASCII so the test can
     262              :      * assert the printable cells land on consecutive columns (no cell
     263              :      * was consumed by \t or \n). */
     264            2 :     const char msg[] = "a\tb\nc";
     265            2 :     int cols = screen_put_str(&s, 0, 0, msg, 0);
     266            2 :     ASSERT(cols == 3, "only 3 printable cols consumed");
     267              : 
     268            2 :     ASSERT(s.back[0].cp == (uint32_t)'a', "'a'");
     269            2 :     ASSERT(s.back[1].cp == (uint32_t)'b', "'b' directly after 'a'");
     270            2 :     ASSERT(s.back[2].cp == (uint32_t)'c', "'c' directly after 'b'");
     271              :     /* If the sanitiser had caught them, we'd see U+00B7 in the buffer. */
     272            2 :     ASSERT(count_cp(&s, 0, 0x00B7) == 0, "no MIDDLE DOT leaked");
     273              :     /* Cell 3 was never touched and stays blank. */
     274            2 :     ASSERT(s.back[3].cp == (uint32_t)' ', "cell 3 is blank");
     275              : 
     276            2 :     screen_drain(&s, &rs);
     277            2 :     assert_no_user_escape_in_stream(rs.buf, rs.len, __func__);
     278            2 :     screen_teardown(&s, &rs);
     279              : }
     280              : 
     281              : /* 5. Emoji smiley 😀 (U+1F600) — terminal_wcwidth returns 2. */
     282            2 : static void test_tui_pane_renders_emoji_message(void) {
     283            2 :     Screen s; RenderSink rs; screen_setup(&s, &rs);
     284              : 
     285              :     /* "hi " + 😀 = 3 + 2 = 5 display cols. */
     286            2 :     const char msg[] = "hi \xf0\x9f\x98\x80";
     287            2 :     int cols = screen_put_str(&s, 0, 0, msg, 0);
     288            2 :     ASSERT(cols == 5, "3 ASCII + 2 emoji = 5 cols");
     289              : 
     290            2 :     ASSERT(s.back[0].cp == (uint32_t)'h', "'h'");
     291            2 :     ASSERT(s.back[1].cp == (uint32_t)'i', "'i'");
     292            2 :     ASSERT(s.back[2].cp == (uint32_t)' ', "space");
     293            2 :     ASSERT(s.back[3].cp == 0x1F600,       "emoji lead cell");
     294            2 :     ASSERT(s.back[3].width == 2,          "emoji is wide (width 2)");
     295            2 :     ASSERT(s.back[4].cp == 0x1F600,       "emoji trailer carries same cp");
     296            2 :     ASSERT(s.back[4].width == 0,          "trailer has width 0");
     297              : 
     298            2 :     screen_drain(&s, &rs);
     299            2 :     assert_no_user_escape_in_stream(rs.buf, rs.len, __func__);
     300            2 :     screen_teardown(&s, &rs);
     301              : }
     302              : 
     303              : /* 6. CJK "你好" — each codepoint is width 2 → 4 display cols. */
     304            2 : static void test_tui_pane_renders_cjk_message(void) {
     305            2 :     Screen s; RenderSink rs; screen_setup(&s, &rs);
     306              : 
     307              :     /* U+4F60 你 = 0xE4 0xBD 0xA0
     308              :      * U+597D 好 = 0xE5 0xA5 0xBD */
     309            2 :     const char msg[] = "\xe4\xbd\xa0\xe5\xa5\xbd";
     310            2 :     int cols = screen_put_str(&s, 0, 0, msg, 0);
     311            2 :     ASSERT(cols == 4, "two CJK glyphs occupy 4 cols");
     312              : 
     313            2 :     ASSERT(s.back[0].cp == 0x4F60, "你 lead");
     314            2 :     ASSERT(s.back[0].width == 2,   "你 wide");
     315            2 :     ASSERT(s.back[1].width == 0,   "你 trailer");
     316            2 :     ASSERT(s.back[2].cp == 0x597D, "好 lead");
     317            2 :     ASSERT(s.back[2].width == 2,   "好 wide");
     318            2 :     ASSERT(s.back[3].width == 0,   "好 trailer");
     319              : 
     320            2 :     screen_drain(&s, &rs);
     321            2 :     assert_no_user_escape_in_stream(rs.buf, rs.len, __func__);
     322            2 :     screen_teardown(&s, &rs);
     323              : }
     324              : 
     325              : /* 7. RTL Hebrew "שלום" — rendered in logical (byte) order, no BiDi
     326              :  *    shaping, no cell corruption. */
     327            2 : static void test_tui_pane_renders_rtl_message(void) {
     328            2 :     Screen s; RenderSink rs; screen_setup(&s, &rs);
     329              : 
     330              :     /* U+05E9 ש, U+05DC ל, U+05D5 ו, U+05DD ם — each width 1. */
     331            2 :     const char msg[] = "\xd7\xa9\xd7\x9c\xd7\x95\xd7\x9d";
     332            2 :     int cols = screen_put_str(&s, 0, 0, msg, 0);
     333            2 :     ASSERT(cols == 4, "four RTL letters, each width 1");
     334              : 
     335            2 :     ASSERT(s.back[0].cp == 0x05E9, "ש in logical position 0");
     336            2 :     ASSERT(s.back[1].cp == 0x05DC, "ל in logical position 1");
     337            2 :     ASSERT(s.back[2].cp == 0x05D5, "ו in logical position 2");
     338            2 :     ASSERT(s.back[3].cp == 0x05DD, "ם in logical position 3");
     339            2 :     assert_no_raw_escape_bytes(&s, "rtl");
     340              : 
     341              :     /* No U+202E RLO override should appear as an uninvited codepoint
     342              :      * and — if a user truly puts RLO inside the body — it must survive
     343              :      * as a zero-width mark, not as a cell that flips anything. */
     344            2 :     ASSERT(find_cp(&s, 0, 0x202E) == -1, "no RLO leaked from elsewhere");
     345              : 
     346              :     /* Separate sub-scenario: a real U+202E in the body goes through
     347              :      * wcwidth() which returns 0, so it is silently skipped rather than
     348              :      * storing a cell that would flip neighbouring cells. */
     349            2 :     const char rlo[] = "A" "\xe2\x80\xae" "B";   /* A <RLO> B */
     350            2 :     screen_clear_back(&s);
     351            2 :     int cols2 = screen_put_str(&s, 0, 0, rlo, 0);
     352            2 :     ASSERT(cols2 == 2, "RLO itself has width 0 so only 'A' + 'B' = 2 cols");
     353            2 :     ASSERT(s.back[0].cp == (uint32_t)'A', "'A'");
     354            2 :     ASSERT(s.back[1].cp == (uint32_t)'B', "'B' follows immediately");
     355              : 
     356            2 :     screen_drain(&s, &rs);
     357            2 :     assert_no_user_escape_in_stream(rs.buf, rs.len, __func__);
     358            2 :     screen_teardown(&s, &rs);
     359              : }
     360              : 
     361              : /* 8. ZWJ cluster 👨‍👩‍👧 — we don't do grapheme-cluster shaping, so each
     362              :  *    constituent lands on its own cell, but no codepoint is dropped,
     363              :  *    no escape leaks, and the width is the arithmetic sum of the parts. */
     364            2 : static void test_tui_pane_zwj_cluster_stays_single_cell(void) {
     365            2 :     Screen s; RenderSink rs; screen_setup(&s, &rs);
     366              : 
     367              :     /* U+1F468 👨 (width 2) + U+200D ZWJ (width 0) + U+1F469 👩 (width 2)
     368              :      * + U+200D ZWJ (width 0) + U+1F467 👧 (width 2). */
     369            2 :     const char msg[] =
     370              :         "\xf0\x9f\x91\xa8" "\xe2\x80\x8d"
     371              :         "\xf0\x9f\x91\xa9" "\xe2\x80\x8d"
     372              :         "\xf0\x9f\x91\xa7";
     373            2 :     int cols = screen_put_str(&s, 0, 0, msg, 0);
     374            2 :     ASSERT(cols == 6, "three wide emoji + two zero-width ZWJ = 6 cols");
     375              : 
     376            2 :     ASSERT(s.back[0].cp == 0x1F468, "👨 lead");
     377            2 :     ASSERT(s.back[0].width == 2,    "👨 width 2");
     378            2 :     ASSERT(s.back[2].cp == 0x1F469, "👩 lead at col 2");
     379            2 :     ASSERT(s.back[4].cp == 0x1F467, "👧 lead at col 4");
     380            2 :     assert_no_raw_escape_bytes(&s, "zwj");
     381              : 
     382            2 :     screen_drain(&s, &rs);
     383            2 :     assert_no_user_escape_in_stream(rs.buf, rs.len, __func__);
     384            2 :     screen_teardown(&s, &rs);
     385              : }
     386              : 
     387              : /* 9. Combining mark: "e" + U+0301 (COMBINING ACUTE) renders as width 1.
     388              :  *    The two codepoints live in one cell slot + zero-width "skip"; the
     389              :  *    important contract is that the combining mark is NOT mis-rendered
     390              :  *    as its own cell that pushes "é"'s base out of alignment. */
     391            2 : static void test_tui_pane_combining_mark_width_zero(void) {
     392            2 :     Screen s; RenderSink rs; screen_setup(&s, &rs);
     393              : 
     394            2 :     const char msg[] = "e\xcc\x81" "xt";   /* e + U+0301 + "xt" */
     395            2 :     int cols = screen_put_str(&s, 0, 0, msg, 0);
     396            2 :     ASSERT(cols == 3, "base 'e' + 'xt' = 3 cols (combining is 0)");
     397              : 
     398            2 :     ASSERT(s.back[0].cp == (uint32_t)'e', "'e'");
     399            2 :     ASSERT(s.back[1].cp == (uint32_t)'x', "'x' directly follows (U+0301 had width 0)");
     400            2 :     ASSERT(s.back[2].cp == (uint32_t)'t', "'t'");
     401              :     /* Combining acute must not have been stored as its own cell. */
     402            2 :     ASSERT(find_cp(&s, 0, 0x0301) == -1,
     403              :            "combining mark did not consume a cell");
     404              : 
     405            2 :     screen_drain(&s, &rs);
     406            2 :     assert_no_user_escape_in_stream(rs.buf, rs.len, __func__);
     407            2 :     screen_teardown(&s, &rs);
     408              : }
     409              : 
     410              : /* 10. Malformed UTF-8: three bytes that cannot begin a valid sequence
     411              :  *     must become three U+FFFD while adjacent ASCII survives. */
     412            2 : static void test_malformed_utf8_replacement_char(void) {
     413            2 :     Screen s; RenderSink rs; screen_setup(&s, &rs);
     414              : 
     415              :     /* 0xC0 is never valid as a UTF-8 lead byte (overlong 2-byte form);
     416              :      * 0xFF and 0xFE are never valid lead bytes at all. */
     417            2 :     const char msg[] = "A" "\xc0\xff\xfe" "B";
     418            2 :     int cols = screen_put_str(&s, 0, 0, msg, 0);
     419            2 :     ASSERT(cols == 5, "A + 3×U+FFFD + B = 5 cols");
     420              : 
     421            2 :     ASSERT(s.back[0].cp == (uint32_t)'A', "leading 'A' preserved");
     422            2 :     ASSERT(s.back[1].cp == 0xFFFD,        "byte 0xC0 → U+FFFD");
     423            2 :     ASSERT(s.back[2].cp == 0xFFFD,        "byte 0xFF → U+FFFD");
     424            2 :     ASSERT(s.back[3].cp == 0xFFFD,        "byte 0xFE → U+FFFD");
     425            2 :     ASSERT(s.back[4].cp == (uint32_t)'B', "trailing 'B' preserved");
     426            2 :     assert_no_raw_escape_bytes(&s, "malformed_utf8");
     427              : 
     428            2 :     screen_drain(&s, &rs);
     429            2 :     assert_no_user_escape_in_stream(rs.buf, rs.len, __func__);
     430            2 :     screen_teardown(&s, &rs);
     431              : }
     432              : 
     433              : /* 11. Overlong / 5-byte sequence — the decoder has no branch for an
     434              :  *     0xF8-lead byte, so it takes the fallback and emits one U+FFFD
     435              :  *     per rejected byte. */
     436            2 : static void test_overlong_utf8_rejected(void) {
     437            2 :     Screen s; RenderSink rs; screen_setup(&s, &rs);
     438              : 
     439              :     /* 0xF8 is the RFC2279 "5-byte" introducer, outlawed by RFC3629.
     440              :      * utf8_decode has no branch for an 0xF8 lead, so it falls through
     441              :      * to the generic "malformed" case: emit U+FFFD and advance by one
     442              :      * byte.  Each of the subsequent continuation bytes (0x88, 0x80…)
     443              :      * also has no valid position on its own and therefore yields its
     444              :      * own U+FFFD.  Result: 5 invalid bytes → 5 U+FFFD cells. */
     445            2 :     const char msg[] = "[" "\xf8\x88\x80\x80\x80" "]";
     446            2 :     int cols = screen_put_str(&s, 0, 0, msg, 0);
     447              :     /* '[' + 5 × U+FFFD + ']' = 7 cols. */
     448            2 :     ASSERT(cols == 7, "bracket + 5 U+FFFD cells + bracket");
     449              : 
     450            2 :     ASSERT(s.back[0].cp == (uint32_t)'[', "leading '['");
     451            2 :     ASSERT(s.back[1].cp == 0xFFFD,        "0xF8 lead → U+FFFD");
     452            2 :     ASSERT(s.back[6].cp == (uint32_t)']', "trailing ']'");
     453            2 :     assert_no_raw_escape_bytes(&s, "overlong");
     454              : 
     455              :     /* Every rejected byte must materialise as U+FFFD (not as a raw
     456              :      * control nor as a zero-width skipped cell). */
     457           12 :     for (int c = 1; c <= 5; c++) {
     458           10 :         ASSERT(s.back[c].cp == 0xFFFD,
     459              :                "every rejected byte materialised as U+FFFD");
     460              :     }
     461              : 
     462            2 :     screen_drain(&s, &rs);
     463            2 :     assert_no_user_escape_in_stream(rs.buf, rs.len, __func__);
     464            2 :     screen_teardown(&s, &rs);
     465              : }
     466              : 
     467              : /* 12. UTF-8 BOM (U+FEFF) mid-message: width 0, so it is silently
     468              :  *     skipped — never rendered as the three-byte "ï»¿" garbage.  The
     469              :  *     adjacent printable text stays on consecutive columns. */
     470            2 : static void test_utf8_bom_zero_width(void) {
     471            2 :     Screen s; RenderSink rs; screen_setup(&s, &rs);
     472              : 
     473              :     /* U+FEFF = 0xEF 0xBB 0xBF. */
     474            2 :     const char msg[] = "a\xef\xbb\xbf" "b";
     475            2 :     int cols = screen_put_str(&s, 0, 0, msg, 0);
     476            2 :     ASSERT(cols == 2, "BOM is zero-width: only 'a' and 'b' consume cols");
     477              : 
     478            2 :     ASSERT(s.back[0].cp == (uint32_t)'a', "'a'");
     479            2 :     ASSERT(s.back[1].cp == (uint32_t)'b', "'b' follows directly after BOM");
     480            2 :     ASSERT(find_cp(&s, 0, 0xFEFF) == -1,  "BOM did not materialise as a cell");
     481              :     /* And no garbage escape sequence byte survived either. */
     482            2 :     assert_no_raw_escape_bytes(&s, "bom");
     483              : 
     484            2 :     screen_drain(&s, &rs);
     485            2 :     assert_no_user_escape_in_stream(rs.buf, rs.len, __func__);
     486            2 :     screen_teardown(&s, &rs);
     487              : }
     488              : 
     489              : /* ================================================================ */
     490              : /* Suite entry point                                                */
     491              : /* ================================================================ */
     492              : 
     493            2 : void run_text_rendering_safety_tests(void) {
     494            2 :     RUN_TEST(test_plain_history_strips_ansi_csi);
     495            2 :     RUN_TEST(test_plain_history_strips_osc_title);
     496            2 :     RUN_TEST(test_plain_history_strips_bel_and_c1);
     497            2 :     RUN_TEST(test_plain_history_preserves_newline_and_tab);
     498            2 :     RUN_TEST(test_tui_pane_renders_emoji_message);
     499            2 :     RUN_TEST(test_tui_pane_renders_cjk_message);
     500            2 :     RUN_TEST(test_tui_pane_renders_rtl_message);
     501            2 :     RUN_TEST(test_tui_pane_zwj_cluster_stays_single_cell);
     502            2 :     RUN_TEST(test_tui_pane_combining_mark_width_zero);
     503            2 :     RUN_TEST(test_malformed_utf8_replacement_char);
     504            2 :     RUN_TEST(test_overlong_utf8_rejected);
     505            2 :     RUN_TEST(test_utf8_bom_zero_width);
     506            2 : }
        

Generated by: LCOV version 2.0-1