LCOV - code coverage report
Current view: top level - tests/unit - test_html_parser.c (source / functions) Coverage Total Hit
Test: coverage.info Lines: 100.0 % 225 225
Test Date: 2026-04-15 21:12:52 Functions: 100.0 % 1 1

            Line data    Source code
       1              : #include "test_helpers.h"
       2              : #include "html_parser.h"
       3              : #include "raii.h"
       4              : #include <string.h>
       5              : #include <stdlib.h>
       6              : 
       7            1 : void test_html_parser(void) {
       8              :     /* Runs 38 independently scoped checks of html_parse(), html_attr_get() and html_node_free(). Cases up to 21 call html_node_free() explicitly where a tree exists; cases 22-38 declare the root with RAII_HTML_NODE and never call html_node_free() — presumably scope-exit cleanup provided by raii.h, TODO confirm. */
       9              :     /* 1. NULL input → NULL */
      10              :     {
      11            1 :         HtmlNode *r = html_parse(NULL);
      12            1 :         ASSERT(r == NULL, "html_parse(NULL) should return NULL");
      13              :     }
      14              : 
      15              :     /* 2. Empty string → root with no children */
      16              :     {
      17            1 :         HtmlNode *r = html_parse("");
      18            1 :         ASSERT(r != NULL, "html_parse empty: root not NULL");
      19            1 :         ASSERT(r->first_child == NULL, "html_parse empty: no children");
      20            1 :         html_node_free(r);
      21              :     }
      22              : 
      23              :     /* 3. Plain text → single TEXT child */
      24              :     {
      25            1 :         HtmlNode *r = html_parse("Hello World");
      26            1 :         ASSERT(r != NULL, "plain text: root not NULL");
      27            1 :         HtmlNode *c = r->first_child;
      28            1 :         ASSERT(c != NULL, "plain text: has child");
      29            1 :         ASSERT(c->type == HTML_NODE_TEXT, "plain text: child is TEXT");
      30            1 :         ASSERT(strcmp(c->text, "Hello World") == 0, "plain text: content matches");
      31            1 :         html_node_free(r);
      32              :     }
      33              : 
      34              :     /* 4. <b>text</b> → ELEMENT(b) → TEXT child */
      35              :     {
      36            1 :         HtmlNode *r = html_parse("<b>text</b>");
      37            1 :         ASSERT(r != NULL, "b: root not NULL");
      38            1 :         HtmlNode *b = r->first_child;
      39            1 :         ASSERT(b != NULL, "b: element present");
      40            1 :         ASSERT(b->type == HTML_NODE_ELEMENT, "b: type ELEMENT");
      41            1 :         ASSERT(strcmp(b->tag, "b") == 0, "b: tag is 'b'");
      42            1 :         HtmlNode *txt = b->first_child;
      43            1 :         ASSERT(txt != NULL, "b: has text child");
      44            1 :         ASSERT(txt->type == HTML_NODE_TEXT, "b: child is TEXT");
      45            1 :         ASSERT(strcmp(txt->text, "text") == 0, "b: text content matches");
      46            1 :         html_node_free(r);
      47              :     }
      48              : 
      49              :     /* 5. Void element <br> — no children */
      50              :     {
      51            1 :         HtmlNode *r = html_parse("<br>");
      52            1 :         ASSERT(r != NULL, "br: root not NULL");
      53            1 :         HtmlNode *br = r->first_child;
      54            1 :         ASSERT(br != NULL, "br: element present");
      55            1 :         ASSERT(strcmp(br->tag, "br") == 0, "br: tag is 'br'");
      56            1 :         ASSERT(br->first_child == NULL, "br: no children");
      57            1 :         html_node_free(r);
      58              :     }
      59              : 
      60              :     /* 6. Self-closing <br/> — no children (unlike case 5, the tag name is not checked here) */
      61              :     {
      62            1 :         HtmlNode *r = html_parse("<br/>");
      63            1 :         ASSERT(r != NULL, "br/: root not NULL");
      64            1 :         HtmlNode *br = r->first_child;
      65            1 :         ASSERT(br != NULL, "br/: element present");
      66            1 :         ASSERT(br->first_child == NULL, "br/: no children");
      67            1 :         html_node_free(r);
      68              :     }
      69              : 
      70              :     /* 7. Attributes: double-quoted */
      71              :     {
      72            1 :         HtmlNode *r = html_parse("<img src=\"foo.png\" alt=\"bar\">");
      73            1 :         ASSERT(r != NULL, "attrs: root not NULL");
      74            1 :         HtmlNode *img = r->first_child;
      75            1 :         ASSERT(img != NULL, "attrs: img present");
      76            1 :         const char *src = html_attr_get(img, "src");
      77            1 :         const char *alt = html_attr_get(img, "alt");
      78            1 :         ASSERT(src && strcmp(src, "foo.png") == 0, "attrs: src matches");
      79            1 :         ASSERT(alt && strcmp(alt, "bar") == 0, "attrs: alt matches");
      80            1 :         html_node_free(r);
      81              :     }
      82              : 
      83              :     /* 8. Attributes: unquoted */
      84              :     {
      85            1 :         HtmlNode *r = html_parse("<input type=text>");
      86            1 :         ASSERT(r != NULL, "unquoted attr: root not NULL");
      87            1 :         HtmlNode *inp = r->first_child;
      88            1 :         ASSERT(inp != NULL, "unquoted attr: input present");
      89            1 :         const char *type = html_attr_get(inp, "type");
      90            1 :         ASSERT(type && strcmp(type, "text") == 0, "unquoted attr: type matches");
      91            1 :         html_node_free(r);
      92              :     }
      93              : 
      94              :     /* 9. Boolean attribute: a valueless attribute must yield a non-NULL lookup result (the value itself is not inspected) */
      95              :     {
      96            1 :         HtmlNode *r = html_parse("<input disabled>");
      97            1 :         ASSERT(r != NULL, "bool attr: root not NULL");
      98            1 :         HtmlNode *inp = r->first_child;
      99            1 :         ASSERT(inp != NULL, "bool attr: input present");
     100            1 :         const char *dis = html_attr_get(inp, "disabled");
     101            1 :         ASSERT(dis != NULL, "bool attr: disabled present");
     102            1 :         html_node_free(r);
     103              :     }
     104              : 
     105              :     /* 10. Entity: &amp; → & */
     106              :     {
     107            1 :         HtmlNode *r = html_parse("a &amp; b");
     108            1 :         ASSERT(r != NULL, "entity amp: root not NULL");
     109            1 :         HtmlNode *c = r->first_child;
     110            1 :         ASSERT(c && c->type == HTML_NODE_TEXT, "entity amp: text node");
     111            1 :         ASSERT(strcmp(c->text, "a & b") == 0, "entity amp: decoded");
     112            1 :         html_node_free(r);
     113              :     }
     114              : 
     115              :     /* 11. Entity: &lt; → < */
     116              :     {
     117            1 :         HtmlNode *r = html_parse("&lt;tag&gt;");
     118            1 :         ASSERT(r != NULL, "entity lt gt: root not NULL");
     119            1 :         HtmlNode *c = r->first_child;
     120            1 :         ASSERT(c && c->type == HTML_NODE_TEXT, "entity lt gt: text node");
     121            1 :         ASSERT(strcmp(c->text, "<tag>") == 0, "entity lt gt: decoded");
     122            1 :         html_node_free(r);
     123              :     }
     124              : 
     125              :     /* 12. Entity: &nbsp; → UTF-8 0xC2 0xA0 */
     126              :     {
     127            1 :         HtmlNode *r = html_parse("&nbsp;");
     128            1 :         ASSERT(r != NULL, "entity nbsp: root not NULL");
     129            1 :         HtmlNode *c = r->first_child;
     130            1 :         ASSERT(c && c->type == HTML_NODE_TEXT, "entity nbsp: text node");
     131            1 :         ASSERT(c->text && (unsigned char)c->text[0] == 0xC2
     132              :                && (unsigned char)c->text[1] == 0xA0, "entity nbsp: UTF-8 correct");
     133            1 :         html_node_free(r);
     134              :     }
     135              : 
     136              :     /* 13. Numeric entity: &#65; → 'A' (only text[0] is compared) */
     137              :     {
     138            1 :         HtmlNode *r = html_parse("&#65;");
     139            1 :         ASSERT(r != NULL, "entity #65: root not NULL");
     140            1 :         HtmlNode *c = r->first_child;
     141            1 :         ASSERT(c && c->type == HTML_NODE_TEXT, "entity #65: text node");
     142            1 :         ASSERT(c->text && c->text[0] == 'A', "entity #65: decoded to A");
     143            1 :         html_node_free(r);
     144              :     }
     145              : 
     146              :     /* 14. Hex entity: &#x41; → 'A' (only text[0] is compared) */
     147              :     {
     148            1 :         HtmlNode *r = html_parse("&#x41;");
     149            1 :         ASSERT(r != NULL, "entity #x41: root not NULL");
     150            1 :         HtmlNode *c = r->first_child;
     151            1 :         ASSERT(c && c->type == HTML_NODE_TEXT, "entity #x41: text node");
     152            1 :         ASSERT(c->text && c->text[0] == 'A', "entity #x41: decoded to A");
     153            1 :         html_node_free(r);
     154              :     }
     155              : 
     156              :     /* 15. Comment ignored */
     157              :     {
     158            1 :         HtmlNode *r = html_parse("<!-- foo -->bar");
     159            1 :         ASSERT(r != NULL, "comment: root not NULL");
     160            1 :         HtmlNode *c = r->first_child;
     161            1 :         ASSERT(c != NULL, "comment: has child");
     162            1 :         ASSERT(c->type == HTML_NODE_TEXT, "comment: child is TEXT");
     163            1 :         ASSERT(strcmp(c->text, "bar") == 0, "comment: only text after comment");
     164            1 :         html_node_free(r);
     165              :     }
     166              : 
     167              :     /* 16. <script> body not in tree */
     168              :     {
     169            1 :         HtmlNode *r = html_parse("<script>alert(1)</script>after");
     170            1 :         ASSERT(r != NULL, "script: root not NULL");
     171            1 :         HtmlNode *c = r->first_child;
     172              :         /* Should be text "after", not the script content */
     173            1 :         ASSERT(c != NULL, "script: has child");
     174            1 :         ASSERT(c->type == HTML_NODE_TEXT, "script: child is TEXT");
     175            1 :         ASSERT(strcmp(c->text, "after") == 0, "script: only text after script");
     176            1 :         html_node_free(r);
     177              :     }
     178              : 
     179              :     /* 17. <style> body not in tree */
     180              :     {
     181            1 :         HtmlNode *r = html_parse("<style>body{color:red}</style>text");
     182            1 :         ASSERT(r != NULL, "style: root not NULL");
     183            1 :         HtmlNode *c = r->first_child;
     184            1 :         ASSERT(c != NULL, "style: has child");
     185            1 :         ASSERT(c->type == HTML_NODE_TEXT, "style: child is TEXT");
     186            1 :         ASSERT(strcmp(c->text, "text") == 0, "style: only text after style");
     187            1 :         html_node_free(r);
     188              :     }
     189              : 
     190              :     /* 18. Bad close tag — no crash, best-effort */
     191              :     {
     192            1 :         HtmlNode *r = html_parse("<b>bold</x>after");
     193            1 :         ASSERT(r != NULL, "bad close: root not NULL");
     194              :         /* Only checks that a root comes back and that freeing it does not crash; the resulting tree shape is deliberately not asserted */
     195            1 :         html_node_free(r);
     196              :     }
     197              : 
     198              :     /* 19. Nested elements */
     199              :     {
     200            1 :         HtmlNode *r = html_parse("<div><p><b>x</b></p></div>");
     201            1 :         ASSERT(r != NULL, "nested: root not NULL");
     202            1 :         HtmlNode *div = r->first_child;
     203            1 :         ASSERT(div && strcmp(div->tag, "div") == 0, "nested: div");
     204            1 :         HtmlNode *p = div->first_child;
     205            1 :         ASSERT(p && strcmp(p->tag, "p") == 0, "nested: p inside div");
     206            1 :         HtmlNode *b = p->first_child;
     207            1 :         ASSERT(b && strcmp(b->tag, "b") == 0, "nested: b inside p");
     208            1 :         HtmlNode *txt = b->first_child;
     209            1 :         ASSERT(txt && txt->type == HTML_NODE_TEXT, "nested: text inside b");
     210            1 :         ASSERT(strcmp(txt->text, "x") == 0, "nested: text content");
     211            1 :         html_node_free(r);
     212              :     }
     213              : 
     214              :     /* 20. html_node_free(NULL) — no crash */
     215              :     {
     216            1 :         html_node_free(NULL); /* must not crash */
     217            1 :         ASSERT(1, "html_node_free(NULL): no crash");
     218              :     }
     219              : 
     220              :     /* 21. html_attr_get on NULL — no crash */
     221              :     {
     222            1 :         const char *v = html_attr_get(NULL, "foo");
     223            1 :         ASSERT(v == NULL, "html_attr_get(NULL): returns NULL");
     224              :     }
     225              : 
     226              :     /* 22. Numeric entity in BMP (3-byte UTF-8): &#x2022; → U+2022 bullet */
     227              :     {
     228            2 :         RAII_HTML_NODE HtmlNode *r = html_parse("&#x2022;");
     229            1 :         ASSERT(r != NULL, "entity bullet hex: root not NULL");
     230            1 :         HtmlNode *c = r->first_child;
     231            1 :         ASSERT(c && c->type == HTML_NODE_TEXT, "entity bullet hex: text node");
     232              :         /* U+2022 = E2 80 A2 in UTF-8 */
     233            1 :         ASSERT(c->text &&
     234              :                (unsigned char)c->text[0] == 0xE2 &&
     235              :                (unsigned char)c->text[1] == 0x80 &&
     236              :                (unsigned char)c->text[2] == 0xA2,
     237              :                "entity bullet hex: 3-byte UTF-8");
     238              :     }
     239              : 
     240              :     /* 23. Hex entity with uppercase A-F: &#xA0; → U+00A0 non-breaking space */
     241              :     {
     242            2 :         RAII_HTML_NODE HtmlNode *r = html_parse("&#xA0;");
     243            1 :         ASSERT(r != NULL, "entity nbsp hex upper: root not NULL");
     244            1 :         HtmlNode *c = r->first_child;
     245            1 :         ASSERT(c && c->type == HTML_NODE_TEXT, "entity nbsp hex upper: text node");
     246            1 :         ASSERT(c->text &&
     247              :                (unsigned char)c->text[0] == 0xC2 &&
     248              :                (unsigned char)c->text[1] == 0xA0,
     249              :                "entity nbsp hex upper: UTF-8 correct");
     250              :     }
     251              : 
     252              :     /* 24. Hex entity with lowercase a-f: &#xa0; → U+00A0 */
     253              :     {
     254            2 :         RAII_HTML_NODE HtmlNode *r = html_parse("&#xa0;");
     255            1 :         ASSERT(r != NULL, "entity nbsp hex lower: root not NULL");
     256            1 :         HtmlNode *c = r->first_child;
     257            1 :         ASSERT(c && c->type == HTML_NODE_TEXT, "entity nbsp hex lower: text node");
     258            1 :         ASSERT(c->text &&
     259              :                (unsigned char)c->text[0] == 0xC2 &&
     260              :                (unsigned char)c->text[1] == 0xA0,
     261              :                "entity nbsp hex lower: UTF-8 correct");
     262              :     }
     263              : 
     264              :     /* 25. 4-byte UTF-8 entity: &#x1F600; → U+1F600 emoji */
     265              :     {
     266            2 :         RAII_HTML_NODE HtmlNode *r = html_parse("&#x1F600;");
     267            1 :         ASSERT(r != NULL, "entity emoji: root not NULL");
     268            1 :         HtmlNode *c = r->first_child;
     269            1 :         ASSERT(c && c->type == HTML_NODE_TEXT, "entity emoji: text node");
     270              :         /* U+1F600 = F0 9F 98 80 in UTF-8; NOTE(review): only the first two bytes (F0 9F) are asserted below */
     271            1 :         ASSERT(c->text &&
     272              :                (unsigned char)c->text[0] == 0xF0 &&
     273              :                (unsigned char)c->text[1] == 0x9F,
     274              :                "entity emoji: 4-byte UTF-8");
     275              :     }
     276              : 
     277              :     /* 26. Unknown entity: &unknown; → copied verbatim (only the leading '&' is asserted) */
     278              :     {
     279            2 :         RAII_HTML_NODE HtmlNode *r = html_parse("&unknown;");
     280            1 :         ASSERT(r != NULL, "entity unknown: root not NULL");
     281            1 :         HtmlNode *c = r->first_child;
     282            1 :         ASSERT(c && c->type == HTML_NODE_TEXT, "entity unknown: text node");
     283            1 :         ASSERT(c->text && c->text[0] == '&', "entity unknown: & preserved");
     284              :     }
     285              : 
     286              :     /* 27. Auto-close list items: <li>one<li>two triggers stk_pop */
     287              :     {
     288            2 :         RAII_HTML_NODE HtmlNode *r = html_parse("<ul><li>one<li>two</ul>");
     289            1 :         ASSERT(r != NULL, "auto-close li: root not NULL");
     290            1 :         HtmlNode *ul = r->first_child;
     291            1 :         ASSERT(ul && strcmp(ul->tag, "ul") == 0, "auto-close li: ul present");
     292              :         /* Should have two li children; the walk below calls strcmp on every child's tag, so it assumes no child of ul has a NULL tag — TODO confirm */
     293            1 :         int count = 0;
     294            1 :         HtmlNode *ch = ul->first_child;
     295            3 :         while (ch) { if (strcmp(ch->tag, "li") == 0) count++; ch = ch->next_sibling; }
     296            1 :         ASSERT(count == 2, "auto-close li: two li children");
     297              :     }
     298              : 
     299              :     /* 28. <!DOCTYPE html> → PS_DECL, no tree node created */
     300              :     {
     301            2 :         RAII_HTML_NODE HtmlNode *r = html_parse("<!DOCTYPE html>text");
     302            1 :         ASSERT(r != NULL, "doctype: root not NULL");
     303            1 :         HtmlNode *c = r->first_child;
     304            1 :         ASSERT(c != NULL, "doctype: has child");
     305            1 :         ASSERT(c->type == HTML_NODE_TEXT, "doctype: child is text");
     306            1 :         ASSERT(strcmp(c->text, "text") == 0, "doctype: only text after DOCTYPE");
     307              :     }
     308              : 
     309              :     /* 29. <script> with no closing tag — no crash, skip to end */
     310              :     {
     311            2 :         RAII_HTML_NODE HtmlNode *r = html_parse("<script>orphan");
     312            1 :         ASSERT(r != NULL, "script no close: root not NULL");
     313              :         /* Expected: no children (script content skipped, nothing after). NOTE(review): that expectation is not asserted; only the non-NULL root is */
     314              :     }
     315              : 
     316              :     /* 30. <br /> (space before /) triggers PS_ATTR_SEP self-close */
     317              :     {
     318            2 :         RAII_HTML_NODE HtmlNode *r = html_parse("<br />");
     319            1 :         ASSERT(r != NULL, "br space slash: root not NULL");
     320            1 :         HtmlNode *br = r->first_child;
     321            1 :         ASSERT(br && strcmp(br->tag, "br") == 0, "br space slash: tag is br");
     322            1 :         ASSERT(br->first_child == NULL, "br space slash: no children");
     323              :     }
     324              : 
     325              :     /* 31. Boolean attr followed by another attr: <input disabled class="x"> */
     326              :     {
     327            2 :         RAII_HTML_NODE HtmlNode *r = html_parse("<input disabled class=\"x\">");
     328            1 :         ASSERT(r != NULL, "bool attr space: root not NULL");
     329            1 :         HtmlNode *inp = r->first_child;
     330            1 :         ASSERT(inp != NULL, "bool attr space: input present");
     331            1 :         const char *dis = html_attr_get(inp, "disabled");
     332            1 :         const char *cls = html_attr_get(inp, "class");
     333            1 :         ASSERT(dis != NULL, "bool attr space: disabled present");
     334            1 :         ASSERT(cls && strcmp(cls, "x") == 0, "bool attr space: class is x");
     335              :     }
     336              : 
     337              :     /* 32. Unquoted attr followed by space+attr: <input type=text class="x"> */
     338              :     {
     339            2 :         RAII_HTML_NODE HtmlNode *r = html_parse("<input type=text class=\"x\">");
     340            1 :         ASSERT(r != NULL, "unquoted+space attr: root not NULL");
     341            1 :         HtmlNode *inp = r->first_child;
     342            1 :         ASSERT(inp != NULL, "unquoted+space attr: input present");
     343            1 :         const char *type = html_attr_get(inp, "type");
     344            1 :         const char *cls  = html_attr_get(inp, "class");
     345            1 :         ASSERT(type && strcmp(type, "text") == 0, "unquoted+space attr: type=text");
     346            1 :         ASSERT(cls  && strcmp(cls,  "x")    == 0, "unquoted+space attr: class=x");
     347              :     }
     348              : 
     349              :     /* 33. Malformed tag: < followed by non-alpha treated as text */
     350              :     {
     351              :         /* Starting with < so the entire result is one TEXT node */
     352            2 :         RAII_HTML_NODE HtmlNode *r = html_parse("<3 love");
     353            1 :         ASSERT(r != NULL, "malformed lt: root not NULL");
     354            1 :         HtmlNode *c = r->first_child;
     355            1 :         ASSERT(c && c->type == HTML_NODE_TEXT, "malformed lt: text node");
     356              :         /* Content should include the < treated as text */
     357            1 :         ASSERT(strstr(c->text, "<") != NULL, "malformed lt: < preserved");
     358              :     }
     359              : 
     360              :     /* 34. Comment with single dash in content: <!-- foo - bar --> */
     361              :     {
     362            2 :         RAII_HTML_NODE HtmlNode *r = html_parse("<!-- foo - bar -->baz");
     363            1 :         ASSERT(r != NULL, "comment single dash: root not NULL");
     364            1 :         HtmlNode *c = r->first_child;
     365            1 :         ASSERT(c && c->type == HTML_NODE_TEXT, "comment single dash: text node");
     366            1 :         ASSERT(strcmp(c->text, "baz") == 0, "comment single dash: only text after");
     367              :     }
     368              : 
     369              :     /* 35. <!- (single dash, no comment) → PS_DECL */
     370              :     {
     371            2 :         RAII_HTML_NODE HtmlNode *r = html_parse("<!-not-a-comment>text");
     372            1 :         ASSERT(r != NULL, "bang single dash: root not NULL");
     373            1 :         HtmlNode *c = r->first_child;
     374            1 :         ASSERT(c && c->type == HTML_NODE_TEXT, "bang single dash: text node");
     375            1 :         ASSERT(strcmp(c->text, "text") == 0, "bang single dash: only text after");
     376              :     }
     377              : 
     378              :     /* 36. <!-- --x --> double-dash then non-> in comment → PS_CMT resumes */
     379              :     {
     380            2 :         RAII_HTML_NODE HtmlNode *r = html_parse("<!-- --x -->end");
     381            1 :         ASSERT(r != NULL, "comment double dash non-close: root not NULL");
     382            1 :         HtmlNode *c = r->first_child;
     383            1 :         ASSERT(c && c->type == HTML_NODE_TEXT, "comment dd non-close: text node");
     384            1 :         ASSERT(strcmp(c->text, "end") == 0, "comment dd non-close: only text after");
     385              :     }
     386              : 
     387              :     /* 37. Attr name starting immediately after tag name (no space): PS_OPEN else branch */
     388              :     {
     389              :         /* "<div=text>" — 'd','i','v' collected in nb, then '=' is not alnum/-/_/:.
     390              :          * so done=1, cur=node_elem("div"), then c='=' → else branch, cited as "line 342" (presumably of the parser source; the number may be stale — verify) */
     391            2 :         RAII_HTML_NODE HtmlNode *r = html_parse("<div=text>");
     392            1 :         ASSERT(r != NULL, "attr immediate: root not NULL");
     393              :         /* div element should exist as first child */
     394            1 :         HtmlNode *c = r->first_child;
     395            1 :         ASSERT(c != NULL, "attr immediate: has child");
     396            1 :         ASSERT(c->type == HTML_NODE_ELEMENT, "attr immediate: element node");
     397            1 :         ASSERT(strcmp(c->tag, "div") == 0, "attr immediate: tag is div");
     398              :     }
     399              : 
     400              :     /* 38. Codepoint > 0x10FFFF → cp_to_utf8 returns 0 (entity dropped) */
     401              :     {
     402              :         /* &#x200000; is 0x200000 > 0x10FFFF → cp_to_utf8 returns 0, entity skipped */
     403            2 :         RAII_HTML_NODE HtmlNode *r = html_parse("&#x200000;");
     404            1 :         ASSERT(r != NULL, "cp>0x10FFFF: root not NULL");
     405              :         /* Entity is dropped: text node has empty text or no children */
     406            1 :         HtmlNode *c = r->first_child;
     407              :         /* A missing child is accepted without any assertion; the type and empty-text checks run only when a child exists */
     408            1 :         if (c) {
     409            1 :             ASSERT(c->type == HTML_NODE_TEXT, "cp>0x10FFFF: text node type");
     410            1 :             ASSERT(c->text != NULL && c->text[0] == '\0',
     411              :                    "cp>0x10FFFF: entity dropped → empty text");
     412              :         }
     413              :     }
     414              : }
        

Generated by: LCOV version 2.0-1