LCOV - code coverage report
Current view: top level - libemail/src/core - html_parser.c (source / functions) Coverage Total Hit
Test: coverage.info Lines: 98.1 % 267 262
Test Date: 2026-04-15 21:12:52 Functions: 100.0 % 22 22

            Line data    Source code
       1              : #include "html_parser.h"
       2              : #include <stdio.h>
       3              : #include <stdlib.h>
       4              : #include <string.h>
       5              : #include <ctype.h>
       6              : 
       7              : /* ── Void elements (never have children) ───────────────────────────── */
       8              : 
       9              : static const char * const VOID_TAGS[] = {
      10              :     "area","base","br","col","embed","hr","img","input",
      11              :     "link","meta","param","source","track","wbr", NULL
      12              : };
      13          317 : static int is_void(const char *t) {
      14          317 :     if (!t) return 0;
      15         4255 :     for (int i = 0; VOID_TAGS[i]; i++)
      16         3984 :         if (strcmp(t, VOID_TAGS[i]) == 0) return 1;
      17          271 :     return 0;
      18              : }
      19              : 
      20              : /* Auto-close: opening one of these implicitly closes the previous sibling */
      21              : static const char * const AUTO_CLOSE_TAGS[] = {
      22              :     "li","p","dt","dd","tr","td","th", NULL
      23              : };
      24          319 : static int is_auto_close(const char *t) {
      25          319 :     if (!t) return 0;
      26         2372 :     for (int i = 0; AUTO_CLOSE_TAGS[i]; i++)
      27         2096 :         if (strcmp(t, AUTO_CLOSE_TAGS[i]) == 0) return 1;
      28          276 :     return 0;
      29              : }
      30              : 
      31              : /* ── Dynamic string ─────────────────────────────────────────────────── */
      32              : 
      33              : typedef struct { char *d; size_t n, cap; } Buf;
      34              : 
      35         5851 : static int buf_push(Buf *b, char c) {
      36         5851 :     if (b->n + 1 >= b->cap) {
      37         1062 :         size_t nc = b->cap ? b->cap * 2 : 64;
      38         1062 :         char *t = realloc(b->d, nc);
      39         1062 :         if (!t) return 0;
      40         1062 :         b->d = t; b->cap = nc;
      41              :     }
      42         5851 :     b->d[b->n++] = c;
      43         5851 :     b->d[b->n]   = '\0';
      44         5851 :     return 1;
      45              : }
      46          556 : static char *buf_take(Buf *b) {
      47          556 :     char *r = b->d; b->d = NULL; b->n = b->cap = 0; return r;
      48              : }
      49         1341 : static void buf_free(Buf *b) { free(b->d); b->d = NULL; b->n = b->cap = 0; }
      50              : 
      51              : /* ── Node helpers ───────────────────────────────────────────────────── */
      52              : 
      53          521 : static HtmlNode *node_elem(char *tag) {
      54          521 :     HtmlNode *n = calloc(1, sizeof(*n));
      55          521 :     if (!n) { free(tag); return NULL; }
      56          521 :     n->type = HTML_NODE_ELEMENT; n->tag = tag; return n;
      57              : }
      58          298 : static HtmlNode *node_text(char *text) {
      59          298 :     HtmlNode *n = calloc(1, sizeof(*n));
      60          298 :     if (!n) { free(text); return NULL; }
      61          298 :     n->type = HTML_NODE_TEXT; n->text = text; return n;
      62              : }
      63          819 : static void attr_free_list(HtmlAttr *a) {
      64          939 :     while (a) { HtmlAttr *nx = a->next; free(a->name); free(a->value); free(a); a = nx; }
      65          819 : }
      66         1841 : void html_node_free(HtmlNode *node) {
      67         1841 :     if (!node) return;
      68          819 :     html_node_free(node->first_child);
      69          819 :     html_node_free(node->next_sibling);
      70          819 :     attr_free_list(node->attrs);
      71          819 :     free(node->tag); free(node->text); free(node);
      72              : }
      73          329 : const char *html_attr_get(const HtmlNode *node, const char *name) {
      74          329 :     if (!node || !name) return NULL;
      75          349 :     for (HtmlAttr *a = node->attrs; a; a = a->next)
      76          140 :         if (strcmp(a->name, name) == 0) return a->value;
      77          209 :     return NULL;
      78              : }
      79          617 : static void node_add_child(HtmlNode *parent, HtmlNode *child) {
      80          617 :     if (!parent || !child) return;
      81          617 :     child->parent = parent;
      82          617 :     if (!parent->first_child) { parent->first_child = child; return; }
      83          160 :     HtmlNode *last = parent->first_child;
      84          657 :     while (last->next_sibling) last = last->next_sibling;
      85          160 :     last->next_sibling = child;
      86              : }
      87          120 : static void attr_prepend(HtmlNode *elem, char *name, char *value) {
      88          120 :     HtmlAttr *a = calloc(1, sizeof(*a));
      89          120 :     if (!a) { free(name); free(value); return; }
      90          120 :     a->name = name; a->value = value; a->next = elem->attrs; elem->attrs = a;
      91              : }
      92              : 
      93              : /* ── Entity decoder ─────────────────────────────────────────────────── */
      94              : 
      95              : static const struct { const char *name; unsigned cp; } ENTITIES[] = {
      96              :     {"amp",'&'},{"lt",'<'},{"gt",'>'},{"quot",'"'},{"apos",'\''},
      97              :     {"nbsp",0xA0},{"copy",0xA9},{"reg",0xAE},{"mdash",0x2014},
      98              :     {"ndash",0x2013},{"hellip",0x2026},{"laquo",0xAB},{"raquo",0xBB},
      99              :     /* zero-width / formatting */
     100              :     {"zwnj",0x200C},{"zwj",0x200D},{"shy",0xAD},
     101              :     /* common symbols */
     102              :     {"bull",0x2022},{"middot",0xB7},{"trade",0x2122},{"euro",0x20AC},
     103              :     {"pound",0xA3},{"cent",0xA2},{"yen",0xA5},
     104              :     {"times",0xD7},{"divide",0xF7},
     105              :     {NULL,0}
     106              : };
     107           13 : static int cp_to_utf8(unsigned cp, char *out) {
     108           13 :     if (cp < 0x80)  { out[0]=(char)cp; return 1; }
     109            7 :     if (cp < 0x800) { out[0]=(char)(0xC0|(cp>>6)); out[1]=(char)(0x80|(cp&0x3F)); return 2; }
     110            4 :     if (cp < 0x10000) {
     111            2 :         out[0]=(char)(0xE0|(cp>>12)); out[1]=(char)(0x80|((cp>>6)&0x3F));
     112            2 :         out[2]=(char)(0x80|(cp&0x3F)); return 3;
     113              :     }
     114            2 :     if (cp <= 0x10FFFF) {
     115            1 :         out[0]=(char)(0xF0|(cp>>18)); out[1]=(char)(0x80|((cp>>12)&0x3F));
     116            1 :         out[2]=(char)(0x80|((cp>>6)&0x3F)); out[3]=(char)(0x80|(cp&0x3F)); return 4;
     117              :     }
     118            1 :     return 0;
     119              : }
     120          298 : static char *decode_entities(const char *in) {
     121          298 :     if (!in) return NULL;
     122          298 :     size_t len = strlen(in);
     123          298 :     char *out = malloc(len * 4 + 1);
     124          298 :     if (!out) return NULL;
     125          298 :     size_t n = 0;
     126         1941 :     for (size_t i = 0; i < len; ) {
     127         1643 :         if (in[i] != '&') { out[n++] = in[i++]; continue; }
     128              :         /* find ';' within reasonable range */
     129           14 :         size_t j = i + 1;
     130           75 :         while (j < len && in[j] != ';' && j - i < 14) j++;
     131           14 :         if (j < len && in[j] == ';' && j > i + 1) {
     132           14 :             const char *e = in + i + 1;
     133           14 :             size_t el = j - i - 1;
     134           14 :             unsigned cp = 0; int matched = 0;
     135           14 :             if (el > 0 && e[0] == '#') {
     136            7 :                 if (el > 1 && (e[1]=='x'||e[1]=='X')) {
     137           27 :                     for (size_t k=2;k<el;k++) {
     138           21 :                         unsigned char c=(unsigned char)e[k];
     139           21 :                         if (c>='0'&&c<='9') cp=cp*16+(c-'0');
     140            3 :                         else if (c>='a'&&c<='f') cp=cp*16+(c-'a'+10);
     141            2 :                         else if (c>='A'&&c<='F') cp=cp*16+(c-'A'+10);
     142              :                     }
     143              :                 } else {
     144            3 :                     for (size_t k=1;k<el;k++)
     145            2 :                         if (e[k]>='0'&&e[k]<='9') cp=cp*10+(unsigned char)(e[k]-'0');
     146              :                 }
     147            7 :                 matched = (cp > 0);
     148            7 :             } else if (el < 10) {
     149            7 :                 char nm[10]={0}; memcpy(nm,e,el);
     150           53 :                 for (int k=0; ENTITIES[k].name; k++)
     151           52 :                     if (strcmp(ENTITIES[k].name,nm)==0) { cp=ENTITIES[k].cp; matched=1; break; }
     152              :             }
     153           14 :             if (matched) {
     154           13 :                 char utf8[5]={0}; int nb=cp_to_utf8(cp,utf8);
     155           35 :                 for (int k=0;k<nb;k++) out[n++]=utf8[k];
     156           13 :                 i = j + 1; continue;
     157              :             }
     158              :         }
     159            1 :         out[n++] = in[i++];
     160              :     }
     161          298 :     out[n] = '\0';
     162          298 :     return out;
     163              : }
     164              : 
     165              : /* ── Open-element stack ─────────────────────────────────────────────── */
     166              : 
     167              : #define STACK_MAX 256
     168              : typedef struct { HtmlNode *n[STACK_MAX]; int top; } Stack;
     169          473 : static void stk_push(Stack *s, HtmlNode *n) { if (s->top<STACK_MAX) s->n[s->top++]=n; }
     170          660 : static HtmlNode *stk_top(Stack *s) { return s->top>0 ? s->n[s->top-1] : NULL; }
     171            3 : static void stk_pop(Stack *s) { if (s->top>0) s->top--; }
     172              : /* Pop up to and including the nearest matching tag; return 1 if found */
     173          195 : static int stk_close(Stack *s, const char *tag) {
     174          239 :     for (int i=s->top-1; i>=0; i--)
     175          238 :         if (s->n[i]->tag && strcmp(s->n[i]->tag,tag)==0) { s->top=i; return 1; }
     176            1 :     return 0;
     177              : }
     178              : 
     179              : /* ── Parser ─────────────────────────────────────────────────────────── */
     180              : 
     181              : typedef enum {
     182              :     PS_TEXT,
     183              :     PS_LT,       /* saw '<' */
     184              :     PS_BANG,     /* saw '<!' */
     185              :     PS_CMT,      /* inside <!-- --> */
     186              :     PS_CMT_D1,   /* saw '-' inside comment */
     187              :     PS_CMT_DD,   /* saw '--' inside comment */
     188              :     PS_DECL,     /* inside <!DOCTYPE ...> */
     189              :     PS_OPEN,     /* reading open tag name */
     190              :     PS_CLOSE,    /* reading close tag name */
     191              :     PS_ATTR_SEP, /* between attributes */
     192              :     PS_ATTR_NM,  /* reading attribute name */
     193              :     PS_ATTR_EQ,  /* saw '=' */
     194              :     PS_ATTR_VQ,  /* in quoted attribute value */
     195              :     PS_ATTR_VU,  /* in unquoted attribute value */
     196              : } PState;
     197              : 
     198          524 : static void tolower_buf(char *s) {
     199         2015 :     for (; s && *s; s++) *s = (char)tolower((unsigned char)*s);
     200          524 : }
     201              : 
     202              : /* Flush text buffer as a text node child of current stack top */
     203          732 : static void flush_text(Stack *stk, Buf *tb) {
     204          732 :     if (!tb->d || !tb->n) { buf_free(tb); return; }
     205          298 :     char *decoded = decode_entities(tb->d);
     206          298 :     HtmlNode *tn = node_text(decoded ? decoded : strdup(tb->d));
     207          298 :     if (!decoded) free(tb->d);
     208          298 :     buf_free(tb);
     209          298 :     if (tn) node_add_child(stk_top(stk), tn);
     210              : }
     211              : 
     212              : /* After completing a tag build: add to tree, push if non-void */
     213          319 : static void commit_elem(Stack *stk, HtmlNode *elem, int self_close) {
     214          319 :     if (!elem) return;
     215          319 :     if (is_auto_close(elem->tag)) {
     216           43 :         HtmlNode *top = stk_top(stk);
     217           43 :         if (top && top->tag && strcmp(top->tag, elem->tag) == 0)
     218            3 :             stk_pop(stk);
     219              :     }
     220          319 :     node_add_child(stk_top(stk), elem);
     221          319 :     if (!self_close && !is_void(elem->tag))
     222          271 :         stk_push(stk, elem);
     223              : }
     224              : 
     225          203 : HtmlNode *html_parse(const char *html) {
     226          203 :     if (!html) return NULL;
     227              : 
     228          202 :     HtmlNode *root = node_elem(strdup("__root__"));
     229          202 :     if (!root) return NULL;
     230              : 
     231          202 :     Stack stk; stk.top = 0;
     232          202 :     stk_push(&stk, root);
     233              : 
     234          202 :     PState state = PS_TEXT;
     235          202 :     Buf tb = {0}; /* text accumulator */
     236          202 :     Buf nb = {0}; /* tag/attr name */
     237          202 :     Buf vb = {0}; /* attr value */
     238          202 :     char qc = '"';
     239              : 
     240          202 :     HtmlNode *cur = NULL; /* element being built */
     241              : 
     242          202 :     const char *p = html;
     243         7833 :     while (*p) {
     244         7632 :         char c = *p;
     245              : 
     246         7632 :         switch (state) {
     247              : 
     248              :         /* ── Text content ── */
     249         2238 :         case PS_TEXT:
     250         2238 :             if (c == '<') { flush_text(&stk, &tb); state = PS_LT; }
     251         1708 :             else buf_push(&tb, c);
     252         2238 :             break;
     253              : 
     254              :         /* ── Just saw '<' ── */
     255          530 :         case PS_LT:
     256          530 :             if (c == '/') { state = PS_CLOSE; }
     257          335 :             else if (c == '!') { state = PS_BANG; }
     258          330 :             else if (isalpha((unsigned char)c) || c == '_') {
     259          329 :                 buf_push(&nb, c); state = PS_OPEN;
     260              :             } else {
     261              :                 /* Not a real tag — treat '<' as text */
     262            1 :                 buf_push(&tb, '<'); buf_push(&tb, c); state = PS_TEXT;
     263              :             }
     264          530 :             break;
     265              : 
     266              :         /* ── <!... ── */
     267            5 :         case PS_BANG:
     268            5 :             if (c == '-') {
     269              :                 /* Peek: if next is also '-' it's a comment */
     270            4 :                 if (p[1] == '-') { p++; state = PS_CMT; }
     271            1 :                 else state = PS_DECL;
     272            1 :             } else state = PS_DECL;
     273            5 :             break;
     274              : 
     275           21 :         case PS_CMT:
     276           21 :             if (c == '-') state = PS_CMT_D1;
     277           21 :             break;
     278            5 :         case PS_CMT_D1:
     279            5 :             if (c == '-') state = PS_CMT_DD;
     280            1 :             else state = PS_CMT;
     281            5 :             break;
     282            4 :         case PS_CMT_DD:
     283            4 :             if (c == '>') state = PS_TEXT;
     284            1 :             else if (c != '-') state = PS_CMT;
     285            4 :             break;
     286           26 :         case PS_DECL:
     287           26 :             if (c == '>') state = PS_TEXT;
     288           26 :             break;
     289              : 
     290              :         /* ── Close tag name ── */
     291          767 :         case PS_CLOSE:
     292          767 :             if (isalnum((unsigned char)c) || c == '-' || c == '_' || c == ':' || c == '.') {
     293          572 :                 buf_push(&nb, c);
     294          195 :             } else if (c == '>') {
     295          195 :                 if (nb.d) { tolower_buf(nb.d); stk_close(&stk, nb.d); }
     296          195 :                 buf_free(&nb);
     297          195 :                 state = PS_TEXT;
     298              :             }
     299              :             /* Skip whitespace and other chars inside </tag   > */
     300          767 :             break;
     301              : 
     302              :         /* ── Open tag name ── */
     303          919 :         case PS_OPEN: {
     304          919 :             int done = 0;
     305          919 :             if (isalnum((unsigned char)c) || c == '-' || c == '_' || c == ':' || c == '.') {
     306          590 :                 buf_push(&nb, c);
     307              :             } else {
     308          329 :                 done = 1;
     309              :             }
     310          919 :             if (done && nb.d) {
     311          329 :                 tolower_buf(nb.d);
     312              : 
     313              :                 /* script / style: skip opening tag then body entirely */
     314          329 :                 if (strcmp(nb.d, "script") == 0 || strcmp(nb.d, "style") == 0) {
     315           10 :                     char close_tag[16];
     316           10 :                     snprintf(close_tag, sizeof(close_tag), "</%s", nb.d);
     317           10 :                     buf_free(&nb);
     318              :                     /* Skip to end of opening tag */
     319           10 :                     while (*p && *p != '>') p++;
     320           10 :                     if (*p) p++; /* skip '>' */
     321              :                     /* Find closing tag (case-insensitive) */
     322           10 :                     const char *end = strcasestr(p, close_tag);
     323           10 :                     if (end) {
     324            9 :                         p = end + strlen(close_tag);
     325            9 :                         while (*p && *p != '>') p++;
     326              :                         /* p now points at '>' of closing tag */
     327              :                     } else {
     328            1 :                         p += strlen(p); /* skip to end */
     329            1 :                         if (*p) {} /* p already at '\0' */
     330              :                     }
     331           10 :                     state = PS_TEXT;
     332           10 :                     break;
     333              :                 }
     334              : 
     335          319 :                 char *tname = buf_take(&nb);
     336          319 :                 cur = node_elem(tname);
     337          319 :                 if (!cur) { state = PS_TEXT; break; }
     338              : 
     339          319 :                 if (c == '>') {
     340          206 :                     commit_elem(&stk, cur, 0); cur = NULL; state = PS_TEXT;
     341          113 :                 } else if (c == '/') {
     342            1 :                     commit_elem(&stk, cur, 1); cur = NULL;
     343            2 :                     while (*p && *p != '>') p++;
     344            1 :                     state = PS_TEXT;
     345          112 :                 } else if (isspace((unsigned char)c)) {
     346          111 :                     state = PS_ATTR_SEP;
     347              :                 } else {
     348              :                     /* Attribute name starting immediately */
     349            1 :                     buf_push(&nb, c); state = PS_ATTR_NM;
     350              :                 }
     351              :             }
     352          909 :             break;
     353              :         }
     354              : 
     355              :         /* ── Between attributes ── */
     356          235 :         case PS_ATTR_SEP:
     357          235 :             if (c == '>') {
     358          108 :                 commit_elem(&stk, cur, 0); cur = NULL; state = PS_TEXT;
     359          127 :             } else if (c == '/') {
     360            1 :                 commit_elem(&stk, cur, 1); cur = NULL;
     361            2 :                 while (*p && *p != '>') p++;
     362            1 :                 state = PS_TEXT;
     363          126 :             } else if (!isspace((unsigned char)c)) {
     364          119 :                 buf_push(&nb, c); state = PS_ATTR_NM;
     365              :             }
     366          235 :             break;
     367              : 
     368              :         /* ── Attribute name ── */
     369          580 :         case PS_ATTR_NM:
     370          580 :             if (c == '=') { state = PS_ATTR_EQ; }
     371          463 :             else if (c == '>') {
     372              :                 /* Boolean attr */
     373            2 :                 if (cur && nb.d && nb.n > 0)
     374            2 :                     attr_prepend(cur, buf_take(&nb), strdup(""));
     375            0 :                 else buf_free(&nb);
     376            2 :                 commit_elem(&stk, cur, 0); cur = NULL; state = PS_TEXT;
     377          461 :             } else if (isspace((unsigned char)c)) {
     378            1 :                 if (cur && nb.d && nb.n > 0)
     379            1 :                     attr_prepend(cur, buf_take(&nb), strdup(""));
     380            0 :                 else buf_free(&nb);
     381            1 :                 state = PS_ATTR_SEP;
     382              :             } else {
     383          460 :                 buf_push(&nb, c);
     384              :             }
     385          580 :             break;
     386              : 
     387              :         /* ── After '=' ── */
     388          117 :         case PS_ATTR_EQ:
     389          117 :             if (c == '"' || c == '\'') { qc = c; state = PS_ATTR_VQ; }
     390            2 :             else if (!isspace((unsigned char)c)) { buf_push(&vb, c); state = PS_ATTR_VU; }
     391          117 :             break;
     392              : 
     393              :         /* ── Quoted attribute value ── */
     394         2177 :         case PS_ATTR_VQ:
     395         2177 :             if (c == qc) {
     396          115 :                 if (cur) attr_prepend(cur, buf_take(&nb), buf_take(&vb));
     397            0 :                 else { buf_free(&nb); buf_free(&vb); }
     398          115 :                 state = PS_ATTR_SEP;
     399         2062 :             } else buf_push(&vb, c);
     400         2177 :             break;
     401              : 
     402              :         /* ── Unquoted attribute value ── */
     403            8 :         case PS_ATTR_VU:
     404            8 :             if (isspace((unsigned char)c)) {
     405            1 :                 if (cur) attr_prepend(cur, buf_take(&nb), buf_take(&vb));
     406            0 :                 else { buf_free(&nb); buf_free(&vb); }
     407            1 :                 state = PS_ATTR_SEP;
     408            7 :             } else if (c == '>') {
     409            1 :                 if (cur) attr_prepend(cur, buf_take(&nb), buf_take(&vb));
     410            0 :                 else { buf_free(&nb); buf_free(&vb); }
     411            1 :                 commit_elem(&stk, cur, 0); cur = NULL; state = PS_TEXT;
     412            6 :             } else buf_push(&vb, c);
     413            8 :             break;
     414              : 
     415              :         } /* switch */
     416              : 
     417         7632 :         if (*p) p++;
     418            1 :         else break;
     419              :     } /* while */
     420              : 
     421          202 :     flush_text(&stk, &tb);
     422          202 :     if (cur) html_node_free(cur); /* partially built element */
     423          202 :     buf_free(&nb); buf_free(&vb);
     424          202 :     return root;
     425              : }
        

Generated by: LCOV version 2.0-1