Line data Source code
1 : #include "html_parser.h"
2 : #include <stdio.h>
3 : #include <stdlib.h>
4 : #include <string.h>
5 : #include <ctype.h>
6 :
7 : /* ── Void elements (never have children) ───────────────────────────── */
8 :
9 : static const char * const VOID_TAGS[] = {
10 : "area","base","br","col","embed","hr","img","input",
11 : "link","meta","param","source","track","wbr", NULL
12 : };
13 864 : static int is_void(const char *t) {
14 864 : if (!t) return 0;
15 12416 : for (int i = 0; VOID_TAGS[i]; i++)
16 11616 : if (strcmp(t, VOID_TAGS[i]) == 0) return 1;
17 800 : return 0;
18 : }
19 :
20 : /* Auto-close: opening one of these implicitly closes the previous sibling */
21 : static const char * const AUTO_CLOSE_TAGS[] = {
22 : "li","p","dt","dd","tr","td","th", NULL
23 : };
24 864 : static int is_auto_close(const char *t) {
25 864 : if (!t) return 0;
26 5408 : for (int i = 0; AUTO_CLOSE_TAGS[i]; i++)
27 4832 : if (strcmp(t, AUTO_CLOSE_TAGS[i]) == 0) return 1;
28 576 : return 0;
29 : }
30 :
31 : /* ── Dynamic string ─────────────────────────────────────────────────── */
32 :
33 : typedef struct { char *d; size_t n, cap; } Buf;
34 :
35 16928 : static int buf_push(Buf *b, char c) {
36 16928 : if (b->n + 1 >= b->cap) {
37 2784 : size_t nc = b->cap ? b->cap * 2 : 64;
38 2784 : char *t = realloc(b->d, nc);
39 2784 : if (!t) return 0;
40 2784 : b->d = t; b->cap = nc;
41 : }
42 16928 : b->d[b->n++] = c;
43 16928 : b->d[b->n] = '\0';
44 16928 : return 1;
45 : }
46 1280 : static char *buf_take(Buf *b) {
47 1280 : char *r = b->d; b->d = NULL; b->n = b->cap = 0; return r;
48 : }
49 2592 : static void buf_free(Buf *b) { free(b->d); b->d = NULL; b->n = b->cap = 0; }
50 :
51 : /* ── Node helpers ───────────────────────────────────────────────────── */
52 :
53 896 : static HtmlNode *node_elem(char *tag) {
54 896 : HtmlNode *n = calloc(1, sizeof(*n));
55 896 : if (!n) { free(tag); return NULL; }
56 896 : n->type = HTML_NODE_ELEMENT; n->tag = tag; return n;
57 : }
58 736 : static HtmlNode *node_text(char *text) {
59 736 : HtmlNode *n = calloc(1, sizeof(*n));
60 736 : if (!n) { free(text); return NULL; }
61 736 : n->type = HTML_NODE_TEXT; n->text = text; return n;
62 : }
63 1632 : static void attr_free_list(HtmlAttr *a) {
64 1856 : while (a) { HtmlAttr *nx = a->next; free(a->name); free(a->value); free(a); a = nx; }
65 1632 : }
66 3296 : void html_node_free(HtmlNode *node) {
67 3296 : if (!node) return;
68 1632 : html_node_free(node->first_child);
69 1632 : html_node_free(node->next_sibling);
70 1632 : attr_free_list(node->attrs);
71 1632 : free(node->tag); free(node->text); free(node);
72 : }
73 960 : const char *html_attr_get(const HtmlNode *node, const char *name) {
74 960 : if (!node || !name) return NULL;
75 1120 : for (HtmlAttr *a = node->attrs; a; a = a->next)
76 352 : if (strcmp(a->name, name) == 0) return a->value;
77 768 : return NULL;
78 : }
79 1600 : static void node_add_child(HtmlNode *parent, HtmlNode *child) {
80 1600 : if (!parent || !child) return;
81 1600 : child->parent = parent;
82 1600 : if (!parent->first_child) { parent->first_child = child; return; }
83 800 : HtmlNode *last = parent->first_child;
84 2816 : while (last->next_sibling) last = last->next_sibling;
85 800 : last->next_sibling = child;
86 : }
87 224 : static void attr_prepend(HtmlNode *elem, char *name, char *value) {
88 224 : HtmlAttr *a = calloc(1, sizeof(*a));
89 224 : if (!a) { free(name); free(value); return; }
90 224 : a->name = name; a->value = value; a->next = elem->attrs; elem->attrs = a;
91 : }
92 :
93 : /* ── Entity decoder ─────────────────────────────────────────────────── */
94 :
95 : static const struct { const char *name; unsigned cp; } ENTITIES[] = {
96 : {"amp",'&'},{"lt",'<'},{"gt",'>'},{"quot",'"'},{"apos",'\''},
97 : {"nbsp",0xA0},{"copy",0xA9},{"reg",0xAE},{"mdash",0x2014},
98 : {"ndash",0x2013},{"hellip",0x2026},{"laquo",0xAB},{"raquo",0xBB},
99 : /* zero-width / formatting */
100 : {"zwnj",0x200C},{"zwj",0x200D},{"shy",0xAD},
101 : /* common symbols */
102 : {"bull",0x2022},{"middot",0xB7},{"trade",0x2122},{"euro",0x20AC},
103 : {"pound",0xA3},{"cent",0xA2},{"yen",0xA5},
104 : {"times",0xD7},{"divide",0xF7},
105 : /* Latin-1 supplement (HTML 4, U+00C0–U+00FF) */
106 : {"Agrave",0xC0},{"agrave",0xE0},{"Aacute",0xC1},{"aacute",0xE1},
107 : {"Acirc", 0xC2},{"acirc", 0xE2},{"Atilde",0xC3},{"atilde",0xE3},
108 : {"Auml", 0xC4},{"auml", 0xE4},{"Aring", 0xC5},{"aring", 0xE5},
109 : {"AElig", 0xC6},{"aelig", 0xE6},{"Ccedil",0xC7},{"ccedil",0xE7},
110 : {"Egrave",0xC8},{"egrave",0xE8},{"Eacute",0xC9},{"eacute",0xE9},
111 : {"Ecirc", 0xCA},{"ecirc", 0xEA},{"Euml", 0xCB},{"euml", 0xEB},
112 : {"Igrave",0xCC},{"igrave",0xEC},{"Iacute",0xCD},{"iacute",0xED},
113 : {"Icirc", 0xCE},{"icirc", 0xEE},{"Iuml", 0xCF},{"iuml", 0xEF},
114 : {"ETH", 0xD0},{"eth", 0xF0},{"Ntilde",0xD1},{"ntilde",0xF1},
115 : {"Ograve",0xD2},{"ograve",0xF2},{"Oacute",0xD3},{"oacute",0xF3},
116 : {"Ocirc", 0xD4},{"ocirc", 0xF4},{"Otilde",0xD5},{"otilde",0xF5},
117 : {"Ouml", 0xD6},{"ouml", 0xF6},{"Oslash",0xD8},{"oslash",0xF8},
118 : {"Ugrave",0xD9},{"ugrave",0xF9},{"Uacute",0xDA},{"uacute",0xFA},
119 : {"Ucirc", 0xDB},{"ucirc", 0xFB},{"Uuml", 0xDC},{"uuml", 0xFC},
120 : {"Yacute",0xDD},{"yacute",0xFD},{"THORN", 0xDE},{"thorn", 0xFE},
121 : {"szlig", 0xDF},{"yuml", 0xFF},{"iexcl",0xA1},{"iquest",0xBF},
122 : {"Iuml", 0xCF},{"ordf", 0xAA},{"ordm", 0xBA},
123 : {NULL,0}
124 : };
125 128 : static int cp_to_utf8(unsigned cp, char *out) {
126 128 : if (cp < 0x80) { out[0]=(char)cp; return 1; }
127 64 : if (cp < 0x800) { out[0]=(char)(0xC0|(cp>>6)); out[1]=(char)(0x80|(cp&0x3F)); return 2; }
128 32 : if (cp < 0x10000) {
129 32 : out[0]=(char)(0xE0|(cp>>12)); out[1]=(char)(0x80|((cp>>6)&0x3F));
130 32 : out[2]=(char)(0x80|(cp&0x3F)); return 3;
131 : }
132 0 : if (cp <= 0x10FFFF) {
133 0 : out[0]=(char)(0xF0|(cp>>18)); out[1]=(char)(0x80|((cp>>12)&0x3F));
134 0 : out[2]=(char)(0x80|((cp>>6)&0x3F)); out[3]=(char)(0x80|(cp&0x3F)); return 4;
135 : }
136 0 : return 0;
137 : }
138 736 : static char *decode_entities(const char *in) {
139 736 : if (!in) return NULL;
140 736 : size_t len = strlen(in);
141 736 : char *out = malloc(len * 4 + 1);
142 736 : if (!out) return NULL;
143 736 : size_t n = 0;
144 6816 : for (size_t i = 0; i < len; ) {
145 6080 : if (in[i] != '&') { out[n++] = in[i++]; continue; }
146 : /* find ';' within reasonable range */
147 128 : size_t j = i + 1;
148 512 : while (j < len && in[j] != ';' && j - i < 14) j++;
149 128 : if (j < len && in[j] == ';' && j > i + 1) {
150 128 : const char *e = in + i + 1;
151 128 : size_t el = j - i - 1;
152 128 : unsigned cp = 0; int matched = 0;
153 128 : if (el > 0 && e[0] == '#') {
154 0 : if (el > 1 && (e[1]=='x'||e[1]=='X')) {
155 0 : for (size_t k=2;k<el;k++) {
156 0 : unsigned char c=(unsigned char)e[k];
157 0 : if (c>='0'&&c<='9') cp=cp*16+(c-'0');
158 0 : else if (c>='a'&&c<='f') cp=cp*16+(c-'a'+10);
159 0 : else if (c>='A'&&c<='F') cp=cp*16+(c-'A'+10);
160 : }
161 : } else {
162 0 : for (size_t k=1;k<el;k++)
163 0 : if (e[k]>='0'&&e[k]<='9') cp=cp*10+(unsigned char)(e[k]-'0');
164 : }
165 0 : matched = (cp > 0);
166 128 : } else if (el < 10) {
167 128 : char nm[10]={0}; memcpy(nm,e,el);
168 1824 : for (int k=0; ENTITIES[k].name; k++)
169 1824 : if (strcmp(ENTITIES[k].name,nm)==0) { cp=ENTITIES[k].cp; matched=1; break; }
170 : }
171 128 : if (matched) {
172 128 : char utf8[5]={0}; int nb=cp_to_utf8(cp,utf8);
173 352 : for (int k=0;k<nb;k++) out[n++]=utf8[k];
174 128 : i = j + 1; continue;
175 : }
176 : }
177 0 : out[n++] = in[i++];
178 : }
179 736 : out[n] = '\0';
180 736 : return out;
181 : }
182 :
183 : /* ── Open-element stack ─────────────────────────────────────────────── */
184 :
185 : #define STACK_MAX 256
186 : typedef struct { HtmlNode *n[STACK_MAX]; int top; } Stack;
187 832 : static void stk_push(Stack *s, HtmlNode *n) { if (s->top<STACK_MAX) s->n[s->top++]=n; }
188 1888 : static HtmlNode *stk_top(Stack *s) { return s->top>0 ? s->n[s->top-1] : NULL; }
189 32 : static void stk_pop(Stack *s) { if (s->top>0) s->top--; }
190 : /* Pop up to and including the nearest matching tag; return 1 if found */
191 736 : static int stk_close(Stack *s, const char *tag) {
192 768 : for (int i=s->top-1; i>=0; i--)
193 768 : if (s->n[i]->tag && strcmp(s->n[i]->tag,tag)==0) { s->top=i; return 1; }
194 0 : return 0;
195 : }
196 :
197 : /* ── Parser ─────────────────────────────────────────────────────────── */
198 :
199 : typedef enum {
200 : PS_TEXT,
201 : PS_LT, /* saw '<' */
202 : PS_BANG, /* saw '<!' */
203 : PS_CMT, /* inside <!-- --> */
204 : PS_CMT_D1, /* saw '-' inside comment */
205 : PS_CMT_DD, /* saw '--' inside comment */
206 : PS_DECL, /* inside <!DOCTYPE ...> */
207 : PS_OPEN, /* reading open tag name */
208 : PS_CLOSE, /* reading close tag name */
209 : PS_ATTR_SEP, /* between attributes */
210 : PS_ATTR_NM, /* reading attribute name */
211 : PS_ATTR_EQ, /* saw '=' */
212 : PS_ATTR_VQ, /* in quoted attribute value */
213 : PS_ATTR_VU, /* in unquoted attribute value */
214 : } PState;
215 :
216 1632 : static void tolower_buf(char *s) {
217 5920 : for (; s && *s; s++) *s = (char)tolower((unsigned char)*s);
218 1632 : }
219 :
220 : /* Flush text buffer as a text node child of current stack top */
221 1760 : static void flush_text(Stack *stk, Buf *tb) {
222 1760 : if (!tb->d || !tb->n) { buf_free(tb); return; }
223 736 : char *decoded = decode_entities(tb->d);
224 736 : HtmlNode *tn = node_text(decoded ? decoded : strdup(tb->d));
225 736 : if (!decoded) free(tb->d);
226 736 : buf_free(tb);
227 736 : if (tn) node_add_child(stk_top(stk), tn);
228 : }
229 :
230 : /* After completing a tag build: add to tree, push if non-void */
231 864 : static void commit_elem(Stack *stk, HtmlNode *elem, int self_close) {
232 864 : if (!elem) return;
233 864 : if (is_auto_close(elem->tag)) {
234 288 : HtmlNode *top = stk_top(stk);
235 288 : if (top && top->tag && strcmp(top->tag, elem->tag) == 0)
236 32 : stk_pop(stk);
237 : }
238 864 : node_add_child(stk_top(stk), elem);
239 864 : if (!self_close && !is_void(elem->tag))
240 800 : stk_push(stk, elem);
241 : }
242 :
243 32 : HtmlNode *html_parse(const char *html) {
244 32 : if (!html) return NULL;
245 :
246 32 : HtmlNode *root = node_elem(strdup("__root__"));
247 32 : if (!root) return NULL;
248 :
249 32 : Stack stk; stk.top = 0;
250 32 : stk_push(&stk, root);
251 :
252 32 : PState state = PS_TEXT;
253 32 : Buf tb = {0}; /* text accumulator */
254 32 : Buf nb = {0}; /* tag/attr name */
255 32 : Buf vb = {0}; /* attr value */
256 32 : char qc = '"';
257 :
258 32 : HtmlNode *cur = NULL; /* element being built */
259 :
260 32 : const char *p = html;
261 22624 : while (*p) {
262 22592 : char c = *p;
263 :
264 22592 : switch (state) {
265 :
266 : /* ── Text content ── */
267 8256 : case PS_TEXT:
268 8256 : if (c == '<') { flush_text(&stk, &tb); state = PS_LT; }
269 6528 : else buf_push(&tb, c);
270 8256 : break;
271 :
272 : /* ── Just saw '<' ── */
273 1728 : case PS_LT:
274 1728 : if (c == '/') { state = PS_CLOSE; }
275 992 : else if (c == '!') { state = PS_BANG; }
276 928 : else if (isalpha((unsigned char)c) || c == '_') {
277 896 : buf_push(&nb, c); state = PS_OPEN;
278 : } else {
279 : /* Not a real tag — treat '<' as text */
280 32 : buf_push(&tb, '<'); buf_push(&tb, c); state = PS_TEXT;
281 : }
282 1728 : break;
283 :
284 : /* ── <!... ── */
285 64 : case PS_BANG:
286 64 : if (c == '-') {
287 : /* Peek: if next is also '-' it's a comment */
288 32 : if (p[1] == '-') { p++; state = PS_CMT; }
289 0 : else state = PS_DECL;
290 32 : } else state = PS_DECL;
291 64 : break;
292 :
293 224 : case PS_CMT:
294 224 : if (c == '-') state = PS_CMT_D1;
295 224 : break;
296 64 : case PS_CMT_D1:
297 64 : if (c == '-') state = PS_CMT_DD;
298 0 : else state = PS_CMT;
299 64 : break;
300 64 : case PS_CMT_DD:
301 64 : if (c == '>') state = PS_TEXT;
302 32 : else if (c != '-') state = PS_CMT;
303 64 : break;
304 384 : case PS_DECL:
305 384 : if (c == '>') state = PS_TEXT;
306 384 : break;
307 :
308 : /* ── Close tag name ── */
309 2656 : case PS_CLOSE:
310 2656 : if (isalnum((unsigned char)c) || c == '-' || c == '_' || c == ':' || c == '.') {
311 1920 : buf_push(&nb, c);
312 736 : } else if (c == '>') {
313 736 : if (nb.d) { tolower_buf(nb.d); stk_close(&stk, nb.d); }
314 736 : buf_free(&nb);
315 736 : state = PS_TEXT;
316 : }
317 : /* Skip whitespace and other chars inside </tag > */
318 2656 : break;
319 :
320 : /* ── Open tag name ── */
321 2368 : case PS_OPEN: {
322 2368 : int done = 0;
323 2368 : if (isalnum((unsigned char)c) || c == '-' || c == '_' || c == ':' || c == '.') {
324 1472 : buf_push(&nb, c);
325 : } else {
326 896 : done = 1;
327 : }
328 2368 : if (done && nb.d) {
329 896 : tolower_buf(nb.d);
330 :
331 : /* script / style: skip opening tag then body entirely */
332 896 : if (strcmp(nb.d, "script") == 0 || strcmp(nb.d, "style") == 0) {
333 : char close_tag[16];
334 32 : snprintf(close_tag, sizeof(close_tag), "</%s", nb.d);
335 32 : buf_free(&nb);
336 : /* Skip to end of opening tag */
337 32 : while (*p && *p != '>') p++;
338 32 : if (*p) p++; /* skip '>' */
339 : /* Find closing tag (case-insensitive) */
340 32 : const char *end = strcasestr(p, close_tag);
341 32 : if (end) {
342 32 : p = end + strlen(close_tag);
343 32 : while (*p && *p != '>') p++;
344 : /* p now points at '>' of closing tag */
345 : } else {
346 0 : p += strlen(p); /* skip to end */
347 0 : if (*p) {} /* p already at '\0' */
348 : }
349 32 : state = PS_TEXT;
350 32 : break;
351 : }
352 :
353 864 : char *tname = buf_take(&nb);
354 864 : cur = node_elem(tname);
355 864 : if (!cur) { state = PS_TEXT; break; }
356 :
357 864 : if (c == '>') {
358 672 : commit_elem(&stk, cur, 0); cur = NULL; state = PS_TEXT;
359 192 : } else if (c == '/') {
360 0 : commit_elem(&stk, cur, 1); cur = NULL;
361 0 : while (*p && *p != '>') p++;
362 0 : state = PS_TEXT;
363 192 : } else if (isspace((unsigned char)c)) {
364 192 : state = PS_ATTR_SEP;
365 : } else {
366 : /* Attribute name starting immediately */
367 0 : buf_push(&nb, c); state = PS_ATTR_NM;
368 : }
369 : }
370 2336 : break;
371 : }
372 :
373 : /* ── Between attributes ── */
374 384 : case PS_ATTR_SEP:
375 384 : if (c == '>') {
376 128 : commit_elem(&stk, cur, 0); cur = NULL; state = PS_TEXT;
377 256 : } else if (c == '/') {
378 0 : commit_elem(&stk, cur, 1); cur = NULL;
379 0 : while (*p && *p != '>') p++;
380 0 : state = PS_TEXT;
381 256 : } else if (!isspace((unsigned char)c)) {
382 224 : buf_push(&nb, c); state = PS_ATTR_NM;
383 : }
384 384 : break;
385 :
386 : /* ── Attribute name ── */
387 1088 : case PS_ATTR_NM:
388 1088 : if (c == '=') { state = PS_ATTR_EQ; }
389 896 : else if (c == '>') {
390 : /* Boolean attr */
391 32 : if (cur && nb.d && nb.n > 0)
392 32 : attr_prepend(cur, buf_take(&nb), strdup(""));
393 0 : else buf_free(&nb);
394 32 : commit_elem(&stk, cur, 0); cur = NULL; state = PS_TEXT;
395 864 : } else if (isspace((unsigned char)c)) {
396 0 : if (cur && nb.d && nb.n > 0)
397 0 : attr_prepend(cur, buf_take(&nb), strdup(""));
398 0 : else buf_free(&nb);
399 0 : state = PS_ATTR_SEP;
400 : } else {
401 864 : buf_push(&nb, c);
402 : }
403 1088 : break;
404 :
405 : /* ── After '=' ── */
406 192 : case PS_ATTR_EQ:
407 192 : if (c == '"' || c == '\'') { qc = c; state = PS_ATTR_VQ; }
408 32 : else if (!isspace((unsigned char)c)) { buf_push(&vb, c); state = PS_ATTR_VU; }
409 192 : break;
410 :
411 : /* ── Quoted attribute value ── */
412 5024 : case PS_ATTR_VQ:
413 5024 : if (c == qc) {
414 160 : if (cur) attr_prepend(cur, buf_take(&nb), buf_take(&vb));
415 0 : else { buf_free(&nb); buf_free(&vb); }
416 160 : state = PS_ATTR_SEP;
417 4864 : } else buf_push(&vb, c);
418 5024 : break;
419 :
420 : /* ── Unquoted attribute value ── */
421 96 : case PS_ATTR_VU:
422 96 : if (isspace((unsigned char)c)) {
423 0 : if (cur) attr_prepend(cur, buf_take(&nb), buf_take(&vb));
424 0 : else { buf_free(&nb); buf_free(&vb); }
425 0 : state = PS_ATTR_SEP;
426 96 : } else if (c == '>') {
427 32 : if (cur) attr_prepend(cur, buf_take(&nb), buf_take(&vb));
428 0 : else { buf_free(&nb); buf_free(&vb); }
429 32 : commit_elem(&stk, cur, 0); cur = NULL; state = PS_TEXT;
430 64 : } else buf_push(&vb, c);
431 96 : break;
432 :
433 : } /* switch */
434 :
435 22592 : if (*p) p++;
436 0 : else break;
437 : } /* while */
438 :
439 32 : flush_text(&stk, &tb);
440 32 : if (cur) html_node_free(cur); /* partially built element */
441 32 : buf_free(&nb); buf_free(&vb);
442 32 : return root;
443 : }
|