Line data Source code
1 : #include "test_helpers.h"
2 : #include "html_parser.h"
3 : #include "raii.h"
4 : #include <string.h>
5 : #include <stdlib.h>
6 :
7 1 : void test_html_parser(void) {
8 :
9 : /* 1. NULL input → NULL */
10 : {
11 1 : HtmlNode *r = html_parse(NULL);
12 1 : ASSERT(r == NULL, "html_parse(NULL) should return NULL");
13 : }
14 :
15 : /* 2. Empty string → root with no children */
16 : {
17 1 : HtmlNode *r = html_parse("");
18 1 : ASSERT(r != NULL, "html_parse empty: root not NULL");
19 1 : ASSERT(r->first_child == NULL, "html_parse empty: no children");
20 1 : html_node_free(r);
21 : }
22 :
23 : /* 3. Plain text → single TEXT child */
24 : {
25 1 : HtmlNode *r = html_parse("Hello World");
26 1 : ASSERT(r != NULL, "plain text: root not NULL");
27 1 : HtmlNode *c = r->first_child;
28 1 : ASSERT(c != NULL, "plain text: has child");
29 1 : ASSERT(c->type == HTML_NODE_TEXT, "plain text: child is TEXT");
30 1 : ASSERT(strcmp(c->text, "Hello World") == 0, "plain text: content matches");
31 1 : html_node_free(r);
32 : }
33 :
34 : /* 4. <b>text</b> → ELEMENT(b) → TEXT child */
35 : {
36 1 : HtmlNode *r = html_parse("<b>text</b>");
37 1 : ASSERT(r != NULL, "b: root not NULL");
38 1 : HtmlNode *b = r->first_child;
39 1 : ASSERT(b != NULL, "b: element present");
40 1 : ASSERT(b->type == HTML_NODE_ELEMENT, "b: type ELEMENT");
41 1 : ASSERT(strcmp(b->tag, "b") == 0, "b: tag is 'b'");
42 1 : HtmlNode *txt = b->first_child;
43 1 : ASSERT(txt != NULL, "b: has text child");
44 1 : ASSERT(txt->type == HTML_NODE_TEXT, "b: child is TEXT");
45 1 : ASSERT(strcmp(txt->text, "text") == 0, "b: text content matches");
46 1 : html_node_free(r);
47 : }
48 :
49 : /* 5. Void element <br> — no children */
50 : {
51 1 : HtmlNode *r = html_parse("<br>");
52 1 : ASSERT(r != NULL, "br: root not NULL");
53 1 : HtmlNode *br = r->first_child;
54 1 : ASSERT(br != NULL, "br: element present");
55 1 : ASSERT(strcmp(br->tag, "br") == 0, "br: tag is 'br'");
56 1 : ASSERT(br->first_child == NULL, "br: no children");
57 1 : html_node_free(r);
58 : }
59 :
60 : /* 6. Self-closing <br/> — no children */
61 : {
62 1 : HtmlNode *r = html_parse("<br/>");
63 1 : ASSERT(r != NULL, "br/: root not NULL");
64 1 : HtmlNode *br = r->first_child;
65 1 : ASSERT(br != NULL, "br/: element present");
66 1 : ASSERT(br->first_child == NULL, "br/: no children");
67 1 : html_node_free(r);
68 : }
69 :
70 : /* 7. Attributes: double-quoted */
71 : {
72 1 : HtmlNode *r = html_parse("<img src=\"foo.png\" alt=\"bar\">");
73 1 : ASSERT(r != NULL, "attrs: root not NULL");
74 1 : HtmlNode *img = r->first_child;
75 1 : ASSERT(img != NULL, "attrs: img present");
76 1 : const char *src = html_attr_get(img, "src");
77 1 : const char *alt = html_attr_get(img, "alt");
78 1 : ASSERT(src && strcmp(src, "foo.png") == 0, "attrs: src matches");
79 1 : ASSERT(alt && strcmp(alt, "bar") == 0, "attrs: alt matches");
80 1 : html_node_free(r);
81 : }
82 :
83 : /* 8. Attributes: unquoted */
84 : {
85 1 : HtmlNode *r = html_parse("<input type=text>");
86 1 : ASSERT(r != NULL, "unquoted attr: root not NULL");
87 1 : HtmlNode *inp = r->first_child;
88 1 : ASSERT(inp != NULL, "unquoted attr: input present");
89 1 : const char *type = html_attr_get(inp, "type");
90 1 : ASSERT(type && strcmp(type, "text") == 0, "unquoted attr: type matches");
91 1 : html_node_free(r);
92 : }
93 :
94 : /* 9. Boolean attribute */
95 : {
96 1 : HtmlNode *r = html_parse("<input disabled>");
97 1 : ASSERT(r != NULL, "bool attr: root not NULL");
98 1 : HtmlNode *inp = r->first_child;
99 1 : ASSERT(inp != NULL, "bool attr: input present");
100 1 : const char *dis = html_attr_get(inp, "disabled");
101 1 : ASSERT(dis != NULL, "bool attr: disabled present");
102 1 : html_node_free(r);
103 : }
104 :
105 : /* 10. Entity: & → & */
106 : {
107 1 : HtmlNode *r = html_parse("a & b");
108 1 : ASSERT(r != NULL, "entity amp: root not NULL");
109 1 : HtmlNode *c = r->first_child;
110 1 : ASSERT(c && c->type == HTML_NODE_TEXT, "entity amp: text node");
111 1 : ASSERT(strcmp(c->text, "a & b") == 0, "entity amp: decoded");
112 1 : html_node_free(r);
113 : }
114 :
115 : /* 11. Entity: < → < */
116 : {
117 1 : HtmlNode *r = html_parse("<tag>");
118 1 : ASSERT(r != NULL, "entity lt gt: root not NULL");
119 1 : HtmlNode *c = r->first_child;
120 1 : ASSERT(c && c->type == HTML_NODE_TEXT, "entity lt gt: text node");
121 1 : ASSERT(strcmp(c->text, "<tag>") == 0, "entity lt gt: decoded");
122 1 : html_node_free(r);
123 : }
124 :
125 : /* 12. Entity: → UTF-8 0xC2 0xA0 */
126 : {
127 1 : HtmlNode *r = html_parse(" ");
128 1 : ASSERT(r != NULL, "entity nbsp: root not NULL");
129 1 : HtmlNode *c = r->first_child;
130 1 : ASSERT(c && c->type == HTML_NODE_TEXT, "entity nbsp: text node");
131 1 : ASSERT(c->text && (unsigned char)c->text[0] == 0xC2
132 : && (unsigned char)c->text[1] == 0xA0, "entity nbsp: UTF-8 correct");
133 1 : html_node_free(r);
134 : }
135 :
136 : /* 13. Numeric entity: A → 'A' */
137 : {
138 1 : HtmlNode *r = html_parse("A");
139 1 : ASSERT(r != NULL, "entity #65: root not NULL");
140 1 : HtmlNode *c = r->first_child;
141 1 : ASSERT(c && c->type == HTML_NODE_TEXT, "entity #65: text node");
142 1 : ASSERT(c->text && c->text[0] == 'A', "entity #65: decoded to A");
143 1 : html_node_free(r);
144 : }
145 :
146 : /* 14. Hex entity: A → 'A' */
147 : {
148 1 : HtmlNode *r = html_parse("A");
149 1 : ASSERT(r != NULL, "entity #x41: root not NULL");
150 1 : HtmlNode *c = r->first_child;
151 1 : ASSERT(c && c->type == HTML_NODE_TEXT, "entity #x41: text node");
152 1 : ASSERT(c->text && c->text[0] == 'A', "entity #x41: decoded to A");
153 1 : html_node_free(r);
154 : }
155 :
156 : /* 15. Comment ignored */
157 : {
158 1 : HtmlNode *r = html_parse("<!-- foo -->bar");
159 1 : ASSERT(r != NULL, "comment: root not NULL");
160 1 : HtmlNode *c = r->first_child;
161 1 : ASSERT(c != NULL, "comment: has child");
162 1 : ASSERT(c->type == HTML_NODE_TEXT, "comment: child is TEXT");
163 1 : ASSERT(strcmp(c->text, "bar") == 0, "comment: only text after comment");
164 1 : html_node_free(r);
165 : }
166 :
167 : /* 16. <script> body not in tree */
168 : {
169 1 : HtmlNode *r = html_parse("<script>alert(1)</script>after");
170 1 : ASSERT(r != NULL, "script: root not NULL");
171 1 : HtmlNode *c = r->first_child;
172 : /* Should be text "after", not the script content */
173 1 : ASSERT(c != NULL, "script: has child");
174 1 : ASSERT(c->type == HTML_NODE_TEXT, "script: child is TEXT");
175 1 : ASSERT(strcmp(c->text, "after") == 0, "script: only text after script");
176 1 : html_node_free(r);
177 : }
178 :
179 : /* 17. <style> body not in tree */
180 : {
181 1 : HtmlNode *r = html_parse("<style>body{color:red}</style>text");
182 1 : ASSERT(r != NULL, "style: root not NULL");
183 1 : HtmlNode *c = r->first_child;
184 1 : ASSERT(c != NULL, "style: has child");
185 1 : ASSERT(c->type == HTML_NODE_TEXT, "style: child is TEXT");
186 1 : ASSERT(strcmp(c->text, "text") == 0, "style: only text after style");
187 1 : html_node_free(r);
188 : }
189 :
190 : /* 18. Bad close tag — no crash, best-effort */
191 : {
192 1 : HtmlNode *r = html_parse("<b>bold</x>after");
193 1 : ASSERT(r != NULL, "bad close: root not NULL");
194 : /* Just ensure no crash and root is valid */
195 1 : html_node_free(r);
196 : }
197 :
198 : /* 19. Nested elements */
199 : {
200 1 : HtmlNode *r = html_parse("<div><p><b>x</b></p></div>");
201 1 : ASSERT(r != NULL, "nested: root not NULL");
202 1 : HtmlNode *div = r->first_child;
203 1 : ASSERT(div && strcmp(div->tag, "div") == 0, "nested: div");
204 1 : HtmlNode *p = div->first_child;
205 1 : ASSERT(p && strcmp(p->tag, "p") == 0, "nested: p inside div");
206 1 : HtmlNode *b = p->first_child;
207 1 : ASSERT(b && strcmp(b->tag, "b") == 0, "nested: b inside p");
208 1 : HtmlNode *txt = b->first_child;
209 1 : ASSERT(txt && txt->type == HTML_NODE_TEXT, "nested: text inside b");
210 1 : ASSERT(strcmp(txt->text, "x") == 0, "nested: text content");
211 1 : html_node_free(r);
212 : }
213 :
214 : /* 20. html_node_free(NULL) — no crash */
215 : {
216 1 : html_node_free(NULL); /* must not crash */
217 1 : ASSERT(1, "html_node_free(NULL): no crash");
218 : }
219 :
220 : /* 21. html_attr_get on NULL — no crash */
221 : {
222 1 : const char *v = html_attr_get(NULL, "foo");
223 1 : ASSERT(v == NULL, "html_attr_get(NULL): returns NULL");
224 : }
225 :
226 : /* 22. Numeric entity in BMP (3-byte UTF-8): • → U+2022 bullet */
227 : {
228 2 : RAII_HTML_NODE HtmlNode *r = html_parse("•");
229 1 : ASSERT(r != NULL, "entity bullet hex: root not NULL");
230 1 : HtmlNode *c = r->first_child;
231 1 : ASSERT(c && c->type == HTML_NODE_TEXT, "entity bullet hex: text node");
232 : /* U+2022 = E2 80 A2 in UTF-8 */
233 1 : ASSERT(c->text &&
234 : (unsigned char)c->text[0] == 0xE2 &&
235 : (unsigned char)c->text[1] == 0x80 &&
236 : (unsigned char)c->text[2] == 0xA2,
237 : "entity bullet hex: 3-byte UTF-8");
238 : }
239 :
240 : /* 23. Hex entity with uppercase A-F:   → U+00A0 non-breaking space */
241 : {
242 2 : RAII_HTML_NODE HtmlNode *r = html_parse(" ");
243 1 : ASSERT(r != NULL, "entity nbsp hex upper: root not NULL");
244 1 : HtmlNode *c = r->first_child;
245 1 : ASSERT(c && c->type == HTML_NODE_TEXT, "entity nbsp hex upper: text node");
246 1 : ASSERT(c->text &&
247 : (unsigned char)c->text[0] == 0xC2 &&
248 : (unsigned char)c->text[1] == 0xA0,
249 : "entity nbsp hex upper: UTF-8 correct");
250 : }
251 :
252 : /* 24. Hex entity with lowercase a-f:   → U+00A0 */
253 : {
254 2 : RAII_HTML_NODE HtmlNode *r = html_parse(" ");
255 1 : ASSERT(r != NULL, "entity nbsp hex lower: root not NULL");
256 1 : HtmlNode *c = r->first_child;
257 1 : ASSERT(c && c->type == HTML_NODE_TEXT, "entity nbsp hex lower: text node");
258 1 : ASSERT(c->text &&
259 : (unsigned char)c->text[0] == 0xC2 &&
260 : (unsigned char)c->text[1] == 0xA0,
261 : "entity nbsp hex lower: UTF-8 correct");
262 : }
263 :
264 : /* 25. 4-byte UTF-8 entity: 😀 → U+1F600 emoji */
265 : {
266 2 : RAII_HTML_NODE HtmlNode *r = html_parse("😀");
267 1 : ASSERT(r != NULL, "entity emoji: root not NULL");
268 1 : HtmlNode *c = r->first_child;
269 1 : ASSERT(c && c->type == HTML_NODE_TEXT, "entity emoji: text node");
270 : /* U+1F600 = F0 9F 98 80 in UTF-8 */
271 1 : ASSERT(c->text &&
272 : (unsigned char)c->text[0] == 0xF0 &&
273 : (unsigned char)c->text[1] == 0x9F,
274 : "entity emoji: 4-byte UTF-8");
275 : }
276 :
277 : /* 26. Unknown entity: &unknown; → copied verbatim */
278 : {
279 2 : RAII_HTML_NODE HtmlNode *r = html_parse("&unknown;");
280 1 : ASSERT(r != NULL, "entity unknown: root not NULL");
281 1 : HtmlNode *c = r->first_child;
282 1 : ASSERT(c && c->type == HTML_NODE_TEXT, "entity unknown: text node");
283 1 : ASSERT(c->text && c->text[0] == '&', "entity unknown: & preserved");
284 : }
285 :
286 : /* 27. Auto-close list items: <li>one<li>two triggers stk_pop */
287 : {
288 2 : RAII_HTML_NODE HtmlNode *r = html_parse("<ul><li>one<li>two</ul>");
289 1 : ASSERT(r != NULL, "auto-close li: root not NULL");
290 1 : HtmlNode *ul = r->first_child;
291 1 : ASSERT(ul && strcmp(ul->tag, "ul") == 0, "auto-close li: ul present");
292 : /* Should have two li children */
293 1 : int count = 0;
294 1 : HtmlNode *ch = ul->first_child;
295 3 : while (ch) { if (strcmp(ch->tag, "li") == 0) count++; ch = ch->next_sibling; }
296 1 : ASSERT(count == 2, "auto-close li: two li children");
297 : }
298 :
299 : /* 28. <!DOCTYPE html> → PS_DECL, no tree node created */
300 : {
301 2 : RAII_HTML_NODE HtmlNode *r = html_parse("<!DOCTYPE html>text");
302 1 : ASSERT(r != NULL, "doctype: root not NULL");
303 1 : HtmlNode *c = r->first_child;
304 1 : ASSERT(c != NULL, "doctype: has child");
305 1 : ASSERT(c->type == HTML_NODE_TEXT, "doctype: child is text");
306 1 : ASSERT(strcmp(c->text, "text") == 0, "doctype: only text after DOCTYPE");
307 : }
308 :
309 : /* 29. <script> with no closing tag — no crash, skip to end */
310 : {
311 2 : RAII_HTML_NODE HtmlNode *r = html_parse("<script>orphan");
312 1 : ASSERT(r != NULL, "script no close: root not NULL");
313 : /* No children (script content skipped, nothing after) */
314 : }
315 :
316 : /* 30. <br /> (space before /) triggers PS_ATTR_SEP self-close */
317 : {
318 2 : RAII_HTML_NODE HtmlNode *r = html_parse("<br />");
319 1 : ASSERT(r != NULL, "br space slash: root not NULL");
320 1 : HtmlNode *br = r->first_child;
321 1 : ASSERT(br && strcmp(br->tag, "br") == 0, "br space slash: tag is br");
322 1 : ASSERT(br->first_child == NULL, "br space slash: no children");
323 : }
324 :
325 : /* 31. Boolean attr followed by another attr: <input disabled class="x"> */
326 : {
327 2 : RAII_HTML_NODE HtmlNode *r = html_parse("<input disabled class=\"x\">");
328 1 : ASSERT(r != NULL, "bool attr space: root not NULL");
329 1 : HtmlNode *inp = r->first_child;
330 1 : ASSERT(inp != NULL, "bool attr space: input present");
331 1 : const char *dis = html_attr_get(inp, "disabled");
332 1 : const char *cls = html_attr_get(inp, "class");
333 1 : ASSERT(dis != NULL, "bool attr space: disabled present");
334 1 : ASSERT(cls && strcmp(cls, "x") == 0, "bool attr space: class is x");
335 : }
336 :
337 : /* 32. Unquoted attr followed by space+attr: <input type=text class="x"> */
338 : {
339 2 : RAII_HTML_NODE HtmlNode *r = html_parse("<input type=text class=\"x\">");
340 1 : ASSERT(r != NULL, "unquoted+space attr: root not NULL");
341 1 : HtmlNode *inp = r->first_child;
342 1 : ASSERT(inp != NULL, "unquoted+space attr: input present");
343 1 : const char *type = html_attr_get(inp, "type");
344 1 : const char *cls = html_attr_get(inp, "class");
345 1 : ASSERT(type && strcmp(type, "text") == 0, "unquoted+space attr: type=text");
346 1 : ASSERT(cls && strcmp(cls, "x") == 0, "unquoted+space attr: class=x");
347 : }
348 :
349 : /* 33. Malformed tag: < followed by non-alpha treated as text */
350 : {
351 : /* Starting with < so the entire result is one TEXT node */
352 2 : RAII_HTML_NODE HtmlNode *r = html_parse("<3 love");
353 1 : ASSERT(r != NULL, "malformed lt: root not NULL");
354 1 : HtmlNode *c = r->first_child;
355 1 : ASSERT(c && c->type == HTML_NODE_TEXT, "malformed lt: text node");
356 : /* Content should include the < treated as text */
357 1 : ASSERT(strstr(c->text, "<") != NULL, "malformed lt: < preserved");
358 : }
359 :
360 : /* 34. Comment with single dash in content: <!-- foo - bar --> */
361 : {
362 2 : RAII_HTML_NODE HtmlNode *r = html_parse("<!-- foo - bar -->baz");
363 1 : ASSERT(r != NULL, "comment single dash: root not NULL");
364 1 : HtmlNode *c = r->first_child;
365 1 : ASSERT(c && c->type == HTML_NODE_TEXT, "comment single dash: text node");
366 1 : ASSERT(strcmp(c->text, "baz") == 0, "comment single dash: only text after");
367 : }
368 :
369 : /* 35. <!- (single dash, no comment) → PS_DECL */
370 : {
371 2 : RAII_HTML_NODE HtmlNode *r = html_parse("<!-not-a-comment>text");
372 1 : ASSERT(r != NULL, "bang single dash: root not NULL");
373 1 : HtmlNode *c = r->first_child;
374 1 : ASSERT(c && c->type == HTML_NODE_TEXT, "bang single dash: text node");
375 1 : ASSERT(strcmp(c->text, "text") == 0, "bang single dash: only text after");
376 : }
377 :
378 : /* 36. <!-- --x --> double-dash then non-> in comment → PS_CMT resumes */
379 : {
380 2 : RAII_HTML_NODE HtmlNode *r = html_parse("<!-- --x -->end");
381 1 : ASSERT(r != NULL, "comment double dash non-close: root not NULL");
382 1 : HtmlNode *c = r->first_child;
383 1 : ASSERT(c && c->type == HTML_NODE_TEXT, "comment dd non-close: text node");
384 1 : ASSERT(strcmp(c->text, "end") == 0, "comment dd non-close: only text after");
385 : }
386 :
387 : /* 37. Attr name starting immediately after tag name (no space): PS_OPEN else branch */
388 : {
389 : /* "<div=text>" — 'd','i','v' collected in nb, then '=' is not alnum/-/_/:.
390 : * so done=1, cur=node_elem("div"), then c='=' → else branch line 342 */
391 2 : RAII_HTML_NODE HtmlNode *r = html_parse("<div=text>");
392 1 : ASSERT(r != NULL, "attr immediate: root not NULL");
393 : /* div element should exist as first child */
394 1 : HtmlNode *c = r->first_child;
395 1 : ASSERT(c != NULL, "attr immediate: has child");
396 1 : ASSERT(c->type == HTML_NODE_ELEMENT, "attr immediate: element node");
397 1 : ASSERT(strcmp(c->tag, "div") == 0, "attr immediate: tag is div");
398 : }
399 :
400 : /* 38. Codepoint > 0x10FFFF → cp_to_utf8 returns 0 (entity dropped) */
401 : {
402 : /* � is 0x200000 > 0x10FFFF → cp_to_utf8 returns 0, entity skipped */
403 2 : RAII_HTML_NODE HtmlNode *r = html_parse("�");
404 1 : ASSERT(r != NULL, "cp>0x10FFFF: root not NULL");
405 : /* Entity is dropped: text node has empty text or no children */
406 1 : HtmlNode *c = r->first_child;
407 : /* Either no child (empty text) or text is empty string */
408 1 : if (c) {
409 1 : ASSERT(c->type == HTML_NODE_TEXT, "cp>0x10FFFF: text node type");
410 1 : ASSERT(c->text != NULL && c->text[0] == '\0',
411 : "cp>0x10FFFF: entity dropped → empty text");
412 : }
413 : }
414 : }
|