Line data Source code
1 : /**
2 : * @file test_text_rendering_safety.c
3 : * @brief TEST-78 / US-27 — Unicode + ANSI-escape safety for screen rendering.
4 : *
5 : * The subject is src/tui/screen.c::screen_put_str[_n], which
6 : * 1. decodes UTF-8 (utf8_decode),
7 : * 2. rewrites hazardous control codepoints to U+00B7 MIDDLE DOT
8 : * (SEC-01 sanitisation block, screen.c L111-119), and
9 : * 3. stores the resulting codepoint + display-width in the Screen
10 : * back buffer via terminal_wcwidth().
11 : *
12 : * All three steps are pure: given a UTF-8 string, the only observable
13 : * side-effect is the mutation of Screen::back[]. No socket, MTProto,
14 : * mock server, or TL parsing is required — the functional test drives
15 : * the production domain end-to-end by writing a message body verbatim
16 : * into a Screen and inspecting the resulting cell grid.
17 : *
18 : * Scenarios covered (12 tests):
19 : * 1. ANSI CSI erase-display (ESC [ 2 J)
20 : * 2. OSC title-set (ESC ] 0 ; evil BEL)
21 : * 3. BEL + 8-bit CSI + DEL (0x07, 0x9B, 0x7F)
22 : * 4. \t and \n are preserved (only allowed low-controls)
23 : * 5. Emoji — smiley (U+1F600, width 2)
24 : * 6. CJK ("你好", 4 cols)
25 : * 7. RTL — Hebrew "שלום" (stored LTR in logical order)
26 : * 8. Zero-width joiner (👨👩👧: ZWJ family cluster)
27 : * 9. Combining mark ("e" + U+0301 → width 1)
28 : * 10. Malformed UTF-8 (three invalid bytes → three U+FFFD)
29 : * 11. Overlong UTF-8 (5-byte F8-lead → U+FFFD)
30 : * 12. UTF-8 BOM inside body (U+FEFF is zero-width, not garbage)
31 : *
32 : * Assertion helpers:
33 : * assert_no_raw_escape_bytes — scans each cell for codepoints 0x1B,
34 : * 0x07, 0x9B, 0x7F; none must survive sanitisation.
35 : * find_cp / count_cp — scan row for codepoint matches.
36 : *
37 : * TEST-55 remains the dialog-title width-only variant; this suite adds
38 : * the missing message-body + injection + malformed-UTF-8 coverage that
39 : * TEST-55 did not reach.
40 : */
41 :
42 : #include "test_helpers.h"
43 :
44 : #include "tui/screen.h"
45 : #include "platform/terminal.h"
46 :
47 : #include <locale.h>
48 : #include <stdint.h>
49 : #include <stdio.h>
50 : #include <stdlib.h>
51 : #include <string.h>
52 :
53 : /* ---- shared helpers ---- */
54 :
55 : /**
56 : * Fail the test if any cell in @p s carries a raw control-character
57 : * codepoint that SEC-01 was supposed to neutralise.
58 : *
59 : * The four bytes targeted are exactly the ones a malicious Telegram
60 : * message could use to break out of our display area:
61 : * 0x1B ESC — lead byte of every ANSI CSI / OSC / DEC sequence.
62 : * 0x07 BEL — audible bell, and terminator of OSC title sequences.
63 : * 0x9B CSI — 8-bit introducer equivalent to "ESC [".
64 : * 0x7F DEL — some terminals interpret as erase-under-cursor.
65 : */
66 16 : static void assert_no_raw_escape_bytes(const Screen *s, const char *where) {
67 16 : int total = s->rows * s->cols;
68 1040 : for (int i = 0; i < total; i++) {
69 1024 : uint32_t cp = s->back[i].cp;
70 1024 : if (cp == 0x1B || cp == 0x07 || cp == 0x9B || cp == 0x7F) {
71 0 : printf(" [FAIL] %s: raw control cp=0x%02X at cell %d\n",
72 : where, (unsigned)cp, i);
73 0 : g_tests_failed++;
74 0 : return;
75 : }
76 : }
77 : }
78 :
79 : /**
80 : * After screen_flip, scan the emitted byte stream for 0x07 / 0x9B / 0x7F
81 : * — these would only reach stdout if the sanitiser had failed, because
82 : * the flipper never emits those bytes itself (it only emits ESC for
83 : * CUP/SGR/DECTCEM sequences, which is expected and therefore excluded
84 : * from the check).
85 : *
86 : * The 0x1B byte is deliberately NOT flagged here: screen_flip always
87 : * writes CUP ("\033[…H") and SGR ("\033[…m") framing before UTF-8
88 : * cell contents. We check for the non-framing escape bytes that
89 : * should NEVER appear in our output.
90 : */
91 24 : static void assert_no_user_escape_in_stream(const char *buf, size_t len,
92 : const char *where) {
93 1864 : for (size_t i = 0; i < len; i++) {
94 1840 : unsigned char b = (unsigned char)buf[i];
95 1840 : if (b == 0x07 || b == 0x9B || b == 0x7F) {
96 0 : printf(" [FAIL] %s: raw byte 0x%02X at stream offset %zu\n",
97 : where, (unsigned)b, i);
98 0 : g_tests_failed++;
99 0 : return;
100 : }
101 : }
102 : }
103 :
104 : /** Return the column of the first cell in @p row whose cp == @p needle,
105 : * or -1 if no such cell exists. */
106 6 : static int find_cp(const Screen *s, int row, uint32_t needle) {
107 390 : for (int c = 0; c < s->cols; c++) {
108 384 : if (s->back[(size_t)row * s->cols + c].cp == needle) return c;
109 : }
110 6 : return -1;
111 : }
112 :
113 : /** Count occurrences of codepoint @p needle on @p row. */
114 2 : static int count_cp(const Screen *s, int row, uint32_t needle) {
115 2 : int n = 0;
116 130 : for (int c = 0; c < s->cols; c++) {
117 128 : if (s->back[(size_t)row * s->cols + c].cp == needle) n++;
118 : }
119 2 : return n;
120 : }
121 :
/**
 * Per-test capture of everything screen_flip writes.
 *
 * The stream is created with open_memstream(), so after a flush the
 * emitted bytes can be examined directly through buf / len.
 */
typedef struct {
    char *buf;   /* memstream buffer; set up by open_memstream, freed in teardown */
    size_t len;  /* number of bytes currently held in buf */
    FILE *out;   /* memstream handle installed as the Screen's output */
} RenderSink;
132 :
133 : /** Small screen used by every test: single row is easiest to reason about
134 : * and still exercises utf8_decode + sanitiser + wcwidth. */
135 24 : static void screen_setup(Screen *s, RenderSink *rs) {
136 : /* terminal_wcwidth() delegates to POSIX wcwidth(3), which on glibc
137 : * returns -1 for any non-ASCII codepoint under the default "C" locale.
138 : * Enable the environment locale (typically *.UTF-8) so wide/narrow
139 : * classification works for emoji, CJK, combining marks, etc. The
140 : * TEST-55 sibling test in tests/unit/test_platform.c does the same. */
141 24 : setlocale(LC_ALL, "");
142 :
143 : /* 1 row × 64 cols is enough for the longest fixture (the CSI test). */
144 24 : int rc = screen_init(s, 1, 64);
145 24 : if (rc != 0) {
146 0 : printf(" [FATAL] screen_init failed\n");
147 0 : abort();
148 : }
149 : /* Route the ANSI byte stream into a memstream so each test can
150 : * both (a) see exactly what screen_flip emits and (b) exercise
151 : * the utf8_encode / sgr_encode / cup_encode / screen_flip paths
152 : * — none of which the back-buffer inspection alone would cover. */
153 24 : rs->buf = NULL; rs->len = 0;
154 24 : rs->out = open_memstream(&rs->buf, &rs->len);
155 24 : if (!rs->out) {
156 0 : printf(" [FATAL] open_memstream failed\n");
157 0 : abort();
158 : }
159 24 : s->out = rs->out;
160 24 : }
161 :
162 24 : static void screen_teardown(Screen *s, RenderSink *rs) {
163 24 : if (rs->out) { fclose(rs->out); rs->out = NULL; }
164 24 : free(rs->buf); rs->buf = NULL; rs->len = 0;
165 24 : screen_free(s);
166 24 : }
167 :
168 : /** Flush the back buffer through screen_flip and return the bytes the
169 : * flipper wrote to the memstream. Must be called before inspecting
170 : * rs->buf / rs->len. */
171 24 : static void screen_drain(Screen *s, RenderSink *rs) {
172 24 : (void)screen_flip(s);
173 24 : fflush(rs->out);
174 24 : }
175 :
176 : /* ================================================================ */
177 : /* Tests */
178 : /* ================================================================ */
179 :
180 : /* 1. ANSI CSI erase-display inside a message body must become MIDDLE DOTs. */
181 2 : static void test_plain_history_strips_ansi_csi(void) {
182 2 : Screen s; RenderSink rs; screen_setup(&s, &rs);
183 :
184 : /* "A" + ESC + "[2J" + "B" — ESC is U+001B which the sanitiser rewrites
185 : * to U+00B7 MIDDLE DOT (width 1 in a UTF-8 locale); the other bytes
186 : * are printable ASCII that pass through untouched. */
187 2 : const char msg[] = "A\x1b[2JB";
188 2 : int cols = screen_put_str(&s, 0, 0, msg, 0);
189 2 : ASSERT(cols == 6, "6 cols: A + MIDDLE_DOT + [ + 2 + J + B");
190 :
191 2 : ASSERT(s.back[0].cp == (uint32_t)'A', "cell 0 is 'A'");
192 2 : ASSERT(s.back[1].cp == 0x00B7, "cell 1 is U+00B7 (ESC replaced)");
193 2 : ASSERT(s.back[2].cp == (uint32_t)'[', "cell 2 is '['");
194 2 : ASSERT(s.back[3].cp == (uint32_t)'2', "cell 3 is '2'");
195 2 : ASSERT(s.back[4].cp == (uint32_t)'J', "cell 4 is 'J'");
196 2 : ASSERT(s.back[5].cp == (uint32_t)'B', "cell 5 is 'B'");
197 2 : assert_no_raw_escape_bytes(&s, "ansi_csi");
198 :
199 2 : screen_drain(&s, &rs);
200 2 : assert_no_user_escape_in_stream(rs.buf, rs.len, __func__);
201 2 : screen_teardown(&s, &rs);
202 : }
203 :
204 : /* 2. OSC title-set sequence: ESC ]0;evil BEL */
205 2 : static void test_plain_history_strips_osc_title(void) {
206 2 : Screen s; RenderSink rs; screen_setup(&s, &rs);
207 :
208 2 : const char msg[] = "\x1b]0;evil\x07" "X";
209 2 : (void)screen_put_str(&s, 0, 0, msg, 0);
210 :
211 : /* ESC becomes MIDDLE DOT; the literal "]0;evil" passes through as
212 : * printable ASCII (not hazardous on its own); BEL becomes MIDDLE DOT;
213 : * 'X' remains. */
214 2 : ASSERT(s.back[0].cp == 0x00B7, "ESC → U+00B7");
215 2 : ASSERT(s.back[1].cp == (uint32_t)']', "']'");
216 2 : ASSERT(s.back[2].cp == (uint32_t)'0', "'0'");
217 2 : ASSERT(s.back[3].cp == (uint32_t)';', "';'");
218 2 : ASSERT(s.back[4].cp == (uint32_t)'e', "'e'");
219 2 : ASSERT(s.back[5].cp == (uint32_t)'v', "'v'");
220 2 : ASSERT(s.back[6].cp == (uint32_t)'i', "'i'");
221 2 : ASSERT(s.back[7].cp == (uint32_t)'l', "'l'");
222 2 : ASSERT(s.back[8].cp == 0x00B7, "BEL → U+00B7");
223 2 : ASSERT(s.back[9].cp == (uint32_t)'X', "trailing 'X' preserved");
224 2 : assert_no_raw_escape_bytes(&s, "osc_title");
225 :
226 2 : screen_drain(&s, &rs);
227 2 : assert_no_user_escape_in_stream(rs.buf, rs.len, __func__);
228 2 : screen_teardown(&s, &rs);
229 : }
230 :
231 : /* 3. BEL (0x07), 8-bit CSI (0xC2 0x9B in UTF-8), and DEL (0x7F). */
232 2 : static void test_plain_history_strips_bel_and_c1(void) {
233 2 : Screen s; RenderSink rs; screen_setup(&s, &rs);
234 :
235 : /* U+009B encodes as the two UTF-8 bytes 0xC2 0x9B. utf8_decode
236 : * should return cp=0x9B which the sanitiser must rewrite. */
237 2 : const char msg[] = "\x07" "a" "\xc2\x9b" "b" "\x7f" "c";
238 2 : (void)screen_put_str(&s, 0, 0, msg, 0);
239 :
240 2 : ASSERT(s.back[0].cp == 0x00B7, "BEL → U+00B7");
241 2 : ASSERT(s.back[1].cp == (uint32_t)'a', "'a'");
242 2 : ASSERT(s.back[2].cp == 0x00B7, "U+009B → U+00B7");
243 2 : ASSERT(s.back[3].cp == (uint32_t)'b', "'b'");
244 2 : ASSERT(s.back[4].cp == 0x00B7, "DEL → U+00B7");
245 2 : ASSERT(s.back[5].cp == (uint32_t)'c', "'c'");
246 2 : assert_no_raw_escape_bytes(&s, "bel_c1_del");
247 :
248 2 : screen_drain(&s, &rs);
249 2 : assert_no_user_escape_in_stream(rs.buf, rs.len, __func__);
250 2 : screen_teardown(&s, &rs);
251 : }
252 :
253 : /* 4. \t (0x09) and \n (0x0A) are the two low-controls that SEC-01
254 : * explicitly allows through — they reach terminal_wcwidth() unchanged.
255 : * wcwidth() returns <=0 for both so screen_put_str skips them silently
256 : * (no cell mutated, no cols consumed). The important property is that
257 : * the sanitiser did NOT replace them with U+00B7. */
258 2 : static void test_plain_history_preserves_newline_and_tab(void) {
259 2 : Screen s; RenderSink rs; screen_setup(&s, &rs);
260 :
261 : /* Interleave the two controls with printable ASCII so the test can
262 : * assert the printable cells land on consecutive columns (no cell
263 : * was consumed by \t or \n). */
264 2 : const char msg[] = "a\tb\nc";
265 2 : int cols = screen_put_str(&s, 0, 0, msg, 0);
266 2 : ASSERT(cols == 3, "only 3 printable cols consumed");
267 :
268 2 : ASSERT(s.back[0].cp == (uint32_t)'a', "'a'");
269 2 : ASSERT(s.back[1].cp == (uint32_t)'b', "'b' directly after 'a'");
270 2 : ASSERT(s.back[2].cp == (uint32_t)'c', "'c' directly after 'b'");
271 : /* If the sanitiser had caught them, we'd see U+00B7 in the buffer. */
272 2 : ASSERT(count_cp(&s, 0, 0x00B7) == 0, "no MIDDLE DOT leaked");
273 : /* Cell 3 was never touched and stays blank. */
274 2 : ASSERT(s.back[3].cp == (uint32_t)' ', "cell 3 is blank");
275 :
276 2 : screen_drain(&s, &rs);
277 2 : assert_no_user_escape_in_stream(rs.buf, rs.len, __func__);
278 2 : screen_teardown(&s, &rs);
279 : }
280 :
281 : /* 5. Emoji smiley 😀 (U+1F600) — terminal_wcwidth returns 2. */
282 2 : static void test_tui_pane_renders_emoji_message(void) {
283 2 : Screen s; RenderSink rs; screen_setup(&s, &rs);
284 :
285 : /* "hi " + 😀 = 3 + 2 = 5 display cols. */
286 2 : const char msg[] = "hi \xf0\x9f\x98\x80";
287 2 : int cols = screen_put_str(&s, 0, 0, msg, 0);
288 2 : ASSERT(cols == 5, "3 ASCII + 2 emoji = 5 cols");
289 :
290 2 : ASSERT(s.back[0].cp == (uint32_t)'h', "'h'");
291 2 : ASSERT(s.back[1].cp == (uint32_t)'i', "'i'");
292 2 : ASSERT(s.back[2].cp == (uint32_t)' ', "space");
293 2 : ASSERT(s.back[3].cp == 0x1F600, "emoji lead cell");
294 2 : ASSERT(s.back[3].width == 2, "emoji is wide (width 2)");
295 2 : ASSERT(s.back[4].cp == 0x1F600, "emoji trailer carries same cp");
296 2 : ASSERT(s.back[4].width == 0, "trailer has width 0");
297 :
298 2 : screen_drain(&s, &rs);
299 2 : assert_no_user_escape_in_stream(rs.buf, rs.len, __func__);
300 2 : screen_teardown(&s, &rs);
301 : }
302 :
303 : /* 6. CJK "你好" — each codepoint is width 2 → 4 display cols. */
304 2 : static void test_tui_pane_renders_cjk_message(void) {
305 2 : Screen s; RenderSink rs; screen_setup(&s, &rs);
306 :
307 : /* U+4F60 你 = 0xE4 0xBD 0xA0
308 : * U+597D 好 = 0xE5 0xA5 0xBD */
309 2 : const char msg[] = "\xe4\xbd\xa0\xe5\xa5\xbd";
310 2 : int cols = screen_put_str(&s, 0, 0, msg, 0);
311 2 : ASSERT(cols == 4, "two CJK glyphs occupy 4 cols");
312 :
313 2 : ASSERT(s.back[0].cp == 0x4F60, "你 lead");
314 2 : ASSERT(s.back[0].width == 2, "你 wide");
315 2 : ASSERT(s.back[1].width == 0, "你 trailer");
316 2 : ASSERT(s.back[2].cp == 0x597D, "好 lead");
317 2 : ASSERT(s.back[2].width == 2, "好 wide");
318 2 : ASSERT(s.back[3].width == 0, "好 trailer");
319 :
320 2 : screen_drain(&s, &rs);
321 2 : assert_no_user_escape_in_stream(rs.buf, rs.len, __func__);
322 2 : screen_teardown(&s, &rs);
323 : }
324 :
325 : /* 7. RTL Hebrew "שלום" — rendered in logical (byte) order, no BiDi
326 : * shaping, no cell corruption. */
327 2 : static void test_tui_pane_renders_rtl_message(void) {
328 2 : Screen s; RenderSink rs; screen_setup(&s, &rs);
329 :
330 : /* U+05E9 ש, U+05DC ל, U+05D5 ו, U+05DD ם — each width 1. */
331 2 : const char msg[] = "\xd7\xa9\xd7\x9c\xd7\x95\xd7\x9d";
332 2 : int cols = screen_put_str(&s, 0, 0, msg, 0);
333 2 : ASSERT(cols == 4, "four RTL letters, each width 1");
334 :
335 2 : ASSERT(s.back[0].cp == 0x05E9, "ש in logical position 0");
336 2 : ASSERT(s.back[1].cp == 0x05DC, "ל in logical position 1");
337 2 : ASSERT(s.back[2].cp == 0x05D5, "ו in logical position 2");
338 2 : ASSERT(s.back[3].cp == 0x05DD, "ם in logical position 3");
339 2 : assert_no_raw_escape_bytes(&s, "rtl");
340 :
341 : /* No U+202E RLO override should appear as an uninvited codepoint
342 : * and — if a user truly puts RLO inside the body — it must survive
343 : * as a zero-width mark, not as a cell that flips anything. */
344 2 : ASSERT(find_cp(&s, 0, 0x202E) == -1, "no RLO leaked from elsewhere");
345 :
346 : /* Separate sub-scenario: a real U+202E in the body goes through
347 : * wcwidth() which returns 0, so it is silently skipped rather than
348 : * storing a cell that would flip neighbouring cells. */
349 2 : const char rlo[] = "A" "\xe2\x80\xae" "B"; /* A <RLO> B */
350 2 : screen_clear_back(&s);
351 2 : int cols2 = screen_put_str(&s, 0, 0, rlo, 0);
352 2 : ASSERT(cols2 == 2, "RLO itself has width 0 so only 'A' + 'B' = 2 cols");
353 2 : ASSERT(s.back[0].cp == (uint32_t)'A', "'A'");
354 2 : ASSERT(s.back[1].cp == (uint32_t)'B', "'B' follows immediately");
355 :
356 2 : screen_drain(&s, &rs);
357 2 : assert_no_user_escape_in_stream(rs.buf, rs.len, __func__);
358 2 : screen_teardown(&s, &rs);
359 : }
360 :
361 : /* 8. ZWJ cluster 👨👩👧 — we don't do grapheme-cluster shaping, so each
362 : * constituent lands on its own cell, but no codepoint is dropped,
363 : * no escape leaks, and the width is the arithmetic sum of the parts. */
364 2 : static void test_tui_pane_zwj_cluster_stays_single_cell(void) {
365 2 : Screen s; RenderSink rs; screen_setup(&s, &rs);
366 :
367 : /* U+1F468 👨 (width 2) + U+200D ZWJ (width 0) + U+1F469 👩 (width 2)
368 : * + U+200D ZWJ (width 0) + U+1F467 👧 (width 2). */
369 2 : const char msg[] =
370 : "\xf0\x9f\x91\xa8" "\xe2\x80\x8d"
371 : "\xf0\x9f\x91\xa9" "\xe2\x80\x8d"
372 : "\xf0\x9f\x91\xa7";
373 2 : int cols = screen_put_str(&s, 0, 0, msg, 0);
374 2 : ASSERT(cols == 6, "three wide emoji + two zero-width ZWJ = 6 cols");
375 :
376 2 : ASSERT(s.back[0].cp == 0x1F468, "👨 lead");
377 2 : ASSERT(s.back[0].width == 2, "👨 width 2");
378 2 : ASSERT(s.back[2].cp == 0x1F469, "👩 lead at col 2");
379 2 : ASSERT(s.back[4].cp == 0x1F467, "👧 lead at col 4");
380 2 : assert_no_raw_escape_bytes(&s, "zwj");
381 :
382 2 : screen_drain(&s, &rs);
383 2 : assert_no_user_escape_in_stream(rs.buf, rs.len, __func__);
384 2 : screen_teardown(&s, &rs);
385 : }
386 :
387 : /* 9. Combining mark: "e" + U+0301 (COMBINING ACUTE) renders as width 1.
388 : * The two codepoints live in one cell slot + zero-width "skip"; the
389 : * important contract is that the combining mark is NOT mis-rendered
390 : * as its own cell that pushes "é"'s base out of alignment. */
391 2 : static void test_tui_pane_combining_mark_width_zero(void) {
392 2 : Screen s; RenderSink rs; screen_setup(&s, &rs);
393 :
394 2 : const char msg[] = "e\xcc\x81" "xt"; /* e + U+0301 + "xt" */
395 2 : int cols = screen_put_str(&s, 0, 0, msg, 0);
396 2 : ASSERT(cols == 3, "base 'e' + 'xt' = 3 cols (combining is 0)");
397 :
398 2 : ASSERT(s.back[0].cp == (uint32_t)'e', "'e'");
399 2 : ASSERT(s.back[1].cp == (uint32_t)'x', "'x' directly follows (U+0301 had width 0)");
400 2 : ASSERT(s.back[2].cp == (uint32_t)'t', "'t'");
401 : /* Combining acute must not have been stored as its own cell. */
402 2 : ASSERT(find_cp(&s, 0, 0x0301) == -1,
403 : "combining mark did not consume a cell");
404 :
405 2 : screen_drain(&s, &rs);
406 2 : assert_no_user_escape_in_stream(rs.buf, rs.len, __func__);
407 2 : screen_teardown(&s, &rs);
408 : }
409 :
410 : /* 10. Malformed UTF-8: three bytes that cannot begin a valid sequence
411 : * must become three U+FFFD while adjacent ASCII survives. */
412 2 : static void test_malformed_utf8_replacement_char(void) {
413 2 : Screen s; RenderSink rs; screen_setup(&s, &rs);
414 :
415 : /* 0xC0 is never valid as a UTF-8 lead byte (overlong 2-byte form);
416 : * 0xFF and 0xFE are never valid lead bytes at all. */
417 2 : const char msg[] = "A" "\xc0\xff\xfe" "B";
418 2 : int cols = screen_put_str(&s, 0, 0, msg, 0);
419 2 : ASSERT(cols == 5, "A + 3×U+FFFD + B = 5 cols");
420 :
421 2 : ASSERT(s.back[0].cp == (uint32_t)'A', "leading 'A' preserved");
422 2 : ASSERT(s.back[1].cp == 0xFFFD, "byte 0xC0 → U+FFFD");
423 2 : ASSERT(s.back[2].cp == 0xFFFD, "byte 0xFF → U+FFFD");
424 2 : ASSERT(s.back[3].cp == 0xFFFD, "byte 0xFE → U+FFFD");
425 2 : ASSERT(s.back[4].cp == (uint32_t)'B', "trailing 'B' preserved");
426 2 : assert_no_raw_escape_bytes(&s, "malformed_utf8");
427 :
428 2 : screen_drain(&s, &rs);
429 2 : assert_no_user_escape_in_stream(rs.buf, rs.len, __func__);
430 2 : screen_teardown(&s, &rs);
431 : }
432 :
433 : /* 11. Overlong / 5-byte sequence — the decoder has no branch for an
434 : * 0xF8-lead byte, so it takes the fallback and emits U+FFFD for
435 : * one byte. */
436 2 : static void test_overlong_utf8_rejected(void) {
437 2 : Screen s; RenderSink rs; screen_setup(&s, &rs);
438 :
439 : /* 0xF8 is the RFC2279 "5-byte" introducer, outlawed by RFC3629.
440 : * utf8_decode has no branch for an 0xF8 lead, so it falls through
441 : * to the generic "malformed" case: emit U+FFFD and advance by one
442 : * byte. Each of the subsequent continuation bytes (0x88, 0x80…)
443 : * also has no valid position on its own and therefore yields its
444 : * own U+FFFD. Result: 5 invalid bytes → 5 U+FFFD cells. */
445 2 : const char msg[] = "[" "\xf8\x88\x80\x80\x80" "]";
446 2 : int cols = screen_put_str(&s, 0, 0, msg, 0);
447 : /* '[' + 5 × U+FFFD + ']' = 7 cols. */
448 2 : ASSERT(cols == 7, "bracket + 5 U+FFFD cells + bracket");
449 :
450 2 : ASSERT(s.back[0].cp == (uint32_t)'[', "leading '['");
451 2 : ASSERT(s.back[1].cp == 0xFFFD, "0xF8 lead → U+FFFD");
452 2 : ASSERT(s.back[6].cp == (uint32_t)']', "trailing ']'");
453 2 : assert_no_raw_escape_bytes(&s, "overlong");
454 :
455 : /* Every rejected byte must materialise as U+FFFD (not as a raw
456 : * control nor as a zero-width skipped cell). */
457 12 : for (int c = 1; c <= 5; c++) {
458 10 : ASSERT(s.back[c].cp == 0xFFFD,
459 : "every rejected byte materialised as U+FFFD");
460 : }
461 :
462 2 : screen_drain(&s, &rs);
463 2 : assert_no_user_escape_in_stream(rs.buf, rs.len, __func__);
464 2 : screen_teardown(&s, &rs);
465 : }
466 :
467 : /* 12. UTF-8 BOM (U+FEFF) mid-message: width 0, so it is silently
468 : * skipped — never rendered as the four-byte "" garbage. The
469 : * adjacent printable text stays on consecutive columns. */
470 2 : static void test_utf8_bom_zero_width(void) {
471 2 : Screen s; RenderSink rs; screen_setup(&s, &rs);
472 :
473 : /* U+FEFF = 0xEF 0xBB 0xBF. */
474 2 : const char msg[] = "a\xef\xbb\xbf" "b";
475 2 : int cols = screen_put_str(&s, 0, 0, msg, 0);
476 2 : ASSERT(cols == 2, "BOM is zero-width: only 'a' and 'b' consume cols");
477 :
478 2 : ASSERT(s.back[0].cp == (uint32_t)'a', "'a'");
479 2 : ASSERT(s.back[1].cp == (uint32_t)'b', "'b' follows directly after BOM");
480 2 : ASSERT(find_cp(&s, 0, 0xFEFF) == -1, "BOM did not materialise as a cell");
481 : /* And no garbage escape sequence byte survived either. */
482 2 : assert_no_raw_escape_bytes(&s, "bom");
483 :
484 2 : screen_drain(&s, &rs);
485 2 : assert_no_user_escape_in_stream(rs.buf, rs.len, __func__);
486 2 : screen_teardown(&s, &rs);
487 : }
488 :
489 : /* ================================================================ */
490 : /* Suite entry point */
491 : /* ================================================================ */
492 :
493 2 : void run_text_rendering_safety_tests(void) {
494 2 : RUN_TEST(test_plain_history_strips_ansi_csi);
495 2 : RUN_TEST(test_plain_history_strips_osc_title);
496 2 : RUN_TEST(test_plain_history_strips_bel_and_c1);
497 2 : RUN_TEST(test_plain_history_preserves_newline_and_tab);
498 2 : RUN_TEST(test_tui_pane_renders_emoji_message);
499 2 : RUN_TEST(test_tui_pane_renders_cjk_message);
500 2 : RUN_TEST(test_tui_pane_renders_rtl_message);
501 2 : RUN_TEST(test_tui_pane_zwj_cluster_stays_single_cell);
502 2 : RUN_TEST(test_tui_pane_combining_mark_width_zero);
503 2 : RUN_TEST(test_malformed_utf8_replacement_char);
504 2 : RUN_TEST(test_overlong_utf8_rejected);
505 2 : RUN_TEST(test_utf8_bom_zero_width);
506 2 : }
|