Line data Source code
1 : #include "html_parser.h"
2 : #include <stdio.h>
3 : #include <stdlib.h>
4 : #include <string.h>
5 : #include <ctype.h>
6 :
7 : /* ── Void elements (never have children) ───────────────────────────── */
8 :
9 : static const char * const VOID_TAGS[] = {
10 : "area","base","br","col","embed","hr","img","input",
11 : "link","meta","param","source","track","wbr", NULL
12 : };
13 1134 : static int is_void(const char *t) {
14 1134 : if (!t) return 0;
15 16350 : for (int i = 0; VOID_TAGS[i]; i++)
16 15294 : if (strcmp(t, VOID_TAGS[i]) == 0) return 1;
17 1056 : return 0;
18 : }
19 :
20 : /* Auto-close: opening one of these implicitly closes the previous sibling */
21 : static const char * const AUTO_CLOSE_TAGS[] = {
22 : "li","p","dt","dd","tr","td","th", NULL
23 : };
24 1136 : static int is_auto_close(const char *t) {
25 1136 : if (!t) return 0;
26 7404 : for (int i = 0; AUTO_CLOSE_TAGS[i]; i++)
27 6599 : if (strcmp(t, AUTO_CLOSE_TAGS[i]) == 0) return 1;
28 805 : return 0;
29 : }
30 :
31 : /* ── Dynamic string ─────────────────────────────────────────────────── */
32 :
33 : typedef struct { char *d; size_t n, cap; } Buf;
34 :
35 22426 : static int buf_push(Buf *b, char c) {
36 22426 : if (b->n + 1 >= b->cap) {
37 3749 : size_t nc = b->cap ? b->cap * 2 : 64;
38 3749 : char *t = realloc(b->d, nc);
39 3749 : if (!t) return 0;
40 3749 : b->d = t; b->cap = nc;
41 : }
42 22426 : b->d[b->n++] = c;
43 22426 : b->d[b->n] = '\0';
44 22426 : return 1;
45 : }
46 1791 : static char *buf_take(Buf *b) {
47 1791 : char *r = b->d; b->d = NULL; b->n = b->cap = 0; return r;
48 : }
49 3857 : static void buf_free(Buf *b) { free(b->d); b->d = NULL; b->n = b->cap = 0; }
50 :
51 : /* ── Node helpers ───────────────────────────────────────────────────── */
52 :
53 1373 : static HtmlNode *node_elem(char *tag) {
54 1373 : HtmlNode *n = calloc(1, sizeof(*n));
55 1373 : if (!n) { free(tag); return NULL; }
56 1373 : n->type = HTML_NODE_ELEMENT; n->tag = tag; return n;
57 : }
58 1001 : static HtmlNode *node_text(char *text) {
59 1001 : HtmlNode *n = calloc(1, sizeof(*n));
60 1001 : if (!n) { free(text); return NULL; }
61 1001 : n->type = HTML_NODE_TEXT; n->text = text; return n;
62 : }
63 2374 : static void attr_free_list(HtmlAttr *a) {
64 2719 : while (a) { HtmlAttr *nx = a->next; free(a->name); free(a->value); free(a); a = nx; }
65 2374 : }
66 4986 : void html_node_free(HtmlNode *node) {
67 4986 : if (!node) return;
68 2374 : html_node_free(node->first_child);
69 2374 : html_node_free(node->next_sibling);
70 2374 : attr_free_list(node->attrs);
71 2374 : free(node->tag); free(node->text); free(node);
72 : }
73 1256 : const char *html_attr_get(const HtmlNode *node, const char *name) {
74 1256 : if (!node || !name) return NULL;
75 1441 : for (HtmlAttr *a = node->attrs; a; a = a->next)
76 509 : if (strcmp(a->name, name) == 0) return a->value;
77 932 : return NULL;
78 : }
79 2137 : static void node_add_child(HtmlNode *parent, HtmlNode *child) {
80 2137 : if (!parent || !child) return;
81 2137 : child->parent = parent;
82 2137 : if (!parent->first_child) { parent->first_child = child; return; }
83 888 : HtmlNode *last = parent->first_child;
84 2921 : while (last->next_sibling) last = last->next_sibling;
85 888 : last->next_sibling = child;
86 : }
87 345 : static void attr_prepend(HtmlNode *elem, char *name, char *value) {
88 345 : HtmlAttr *a = calloc(1, sizeof(*a));
89 345 : if (!a) { free(name); free(value); return; }
90 345 : a->name = name; a->value = value; a->next = elem->attrs; elem->attrs = a;
91 : }
92 :
93 : /* ── Entity decoder ─────────────────────────────────────────────────── */
94 :
95 : static const struct { const char *name; unsigned cp; } ENTITIES[] = {
96 : {"amp",'&'},{"lt",'<'},{"gt",'>'},{"quot",'"'},{"apos",'\''},
97 : {"nbsp",0xA0},{"copy",0xA9},{"reg",0xAE},{"mdash",0x2014},
98 : {"ndash",0x2013},{"hellip",0x2026},{"laquo",0xAB},{"raquo",0xBB},
99 : /* zero-width / formatting */
100 : {"zwnj",0x200C},{"zwj",0x200D},{"shy",0xAD},
101 : /* common symbols */
102 : {"bull",0x2022},{"middot",0xB7},{"trade",0x2122},{"euro",0x20AC},
103 : {"pound",0xA3},{"cent",0xA2},{"yen",0xA5},
104 : {"times",0xD7},{"divide",0xF7},
105 : /* Latin-1 supplement (HTML 4, U+00C0–U+00FF) */
106 : {"Agrave",0xC0},{"agrave",0xE0},{"Aacute",0xC1},{"aacute",0xE1},
107 : {"Acirc", 0xC2},{"acirc", 0xE2},{"Atilde",0xC3},{"atilde",0xE3},
108 : {"Auml", 0xC4},{"auml", 0xE4},{"Aring", 0xC5},{"aring", 0xE5},
109 : {"AElig", 0xC6},{"aelig", 0xE6},{"Ccedil",0xC7},{"ccedil",0xE7},
110 : {"Egrave",0xC8},{"egrave",0xE8},{"Eacute",0xC9},{"eacute",0xE9},
111 : {"Ecirc", 0xCA},{"ecirc", 0xEA},{"Euml", 0xCB},{"euml", 0xEB},
112 : {"Igrave",0xCC},{"igrave",0xEC},{"Iacute",0xCD},{"iacute",0xED},
113 : {"Icirc", 0xCE},{"icirc", 0xEE},{"Iuml", 0xCF},{"iuml", 0xEF},
114 : {"ETH", 0xD0},{"eth", 0xF0},{"Ntilde",0xD1},{"ntilde",0xF1},
115 : {"Ograve",0xD2},{"ograve",0xF2},{"Oacute",0xD3},{"oacute",0xF3},
116 : {"Ocirc", 0xD4},{"ocirc", 0xF4},{"Otilde",0xD5},{"otilde",0xF5},
117 : {"Ouml", 0xD6},{"ouml", 0xF6},{"Oslash",0xD8},{"oslash",0xF8},
118 : {"Ugrave",0xD9},{"ugrave",0xF9},{"Uacute",0xDA},{"uacute",0xFA},
119 : {"Ucirc", 0xDB},{"ucirc", 0xFB},{"Uuml", 0xDC},{"uuml", 0xFC},
120 : {"Yacute",0xDD},{"yacute",0xFD},{"THORN", 0xDE},{"thorn", 0xFE},
121 : {"szlig", 0xDF},{"yuml", 0xFF},{"iexcl",0xA1},{"iquest",0xBF},
122 : {"Iuml", 0xCF},{"ordf", 0xAA},{"ordm", 0xBA},
123 : {NULL,0}
124 : };
125 146 : static int cp_to_utf8(unsigned cp, char *out) {
126 146 : if (cp < 0x80) { out[0]=(char)cp; return 1; }
127 76 : if (cp < 0x800) { out[0]=(char)(0xC0|(cp>>6)); out[1]=(char)(0x80|(cp&0x3F)); return 2; }
128 36 : if (cp < 0x10000) {
129 34 : out[0]=(char)(0xE0|(cp>>12)); out[1]=(char)(0x80|((cp>>6)&0x3F));
130 34 : out[2]=(char)(0x80|(cp&0x3F)); return 3;
131 : }
132 2 : if (cp <= 0x10FFFF) {
133 1 : out[0]=(char)(0xF0|(cp>>18)); out[1]=(char)(0x80|((cp>>12)&0x3F));
134 1 : out[2]=(char)(0x80|((cp>>6)&0x3F)); out[3]=(char)(0x80|(cp&0x3F)); return 4;
135 : }
136 1 : return 0;
137 : }
138 1001 : static char *decode_entities(const char *in) {
139 1001 : if (!in) return NULL;
140 1001 : size_t len = strlen(in);
141 1001 : char *out = malloc(len * 4 + 1);
142 1001 : if (!out) return NULL;
143 1001 : size_t n = 0;
144 8495 : for (size_t i = 0; i < len; ) {
145 7494 : if (in[i] != '&') { out[n++] = in[i++]; continue; }
146 : /* find ';' within reasonable range */
147 147 : size_t j = i + 1;
148 620 : while (j < len && in[j] != ';' && j - i < 14) j++;
149 147 : if (j < len && in[j] == ';' && j > i + 1) {
150 147 : const char *e = in + i + 1;
151 147 : size_t el = j - i - 1;
152 147 : unsigned cp = 0; int matched = 0;
153 147 : if (el > 0 && e[0] == '#') {
154 7 : if (el > 1 && (e[1]=='x'||e[1]=='X')) {
155 27 : for (size_t k=2;k<el;k++) {
156 21 : unsigned char c=(unsigned char)e[k];
157 21 : if (c>='0'&&c<='9') cp=cp*16+(c-'0');
158 3 : else if (c>='a'&&c<='f') cp=cp*16+(c-'a'+10);
159 2 : else if (c>='A'&&c<='F') cp=cp*16+(c-'A'+10);
160 : }
161 : } else {
162 3 : for (size_t k=1;k<el;k++)
163 2 : if (e[k]>='0'&&e[k]<='9') cp=cp*10+(unsigned char)(e[k]-'0');
164 : }
165 7 : matched = (cp > 0);
166 140 : } else if (el < 10) {
167 140 : char nm[10]={0}; memcpy(nm,e,el);
168 2157 : for (int k=0; ENTITIES[k].name; k++)
169 2156 : if (strcmp(ENTITIES[k].name,nm)==0) { cp=ENTITIES[k].cp; matched=1; break; }
170 : }
171 147 : if (matched) {
172 146 : char utf8[5]={0}; int nb=cp_to_utf8(cp,utf8);
173 402 : for (int k=0;k<nb;k++) out[n++]=utf8[k];
174 146 : i = j + 1; continue;
175 : }
176 : }
177 1 : out[n++] = in[i++];
178 : }
179 1001 : out[n] = '\0';
180 1001 : return out;
181 : }
182 :
183 : /* ── Open-element stack ─────────────────────────────────────────────── */
184 :
185 : #define STACK_MAX 256
186 : typedef struct { HtmlNode *n[STACK_MAX]; int top; } Stack;
187 1293 : static void stk_push(Stack *s, HtmlNode *n) { if (s->top<STACK_MAX) s->n[s->top++]=n; }
188 2468 : static HtmlNode *stk_top(Stack *s) { return s->top>0 ? s->n[s->top-1] : NULL; }
189 35 : static void stk_pop(Stack *s) { if (s->top>0) s->top--; }
190 : /* Pop up to and including the nearest matching tag; return 1 if found */
191 916 : static int stk_close(Stack *s, const char *tag) {
192 992 : for (int i=s->top-1; i>=0; i--)
193 991 : if (s->n[i]->tag && strcmp(s->n[i]->tag,tag)==0) { s->top=i; return 1; }
194 1 : return 0;
195 : }
196 :
197 : /* ── Parser ─────────────────────────────────────────────────────────── */
198 :
199 : typedef enum {
200 : PS_TEXT,
201 : PS_LT, /* saw '<' */
202 : PS_BANG, /* saw '<!' */
203 : PS_CMT, /* inside <!-- --> */
204 : PS_CMT_D1, /* saw '-' inside comment */
205 : PS_CMT_DD, /* saw '--' inside comment */
206 : PS_DECL, /* inside <!DOCTYPE ...> */
207 : PS_OPEN, /* reading open tag name */
208 : PS_CLOSE, /* reading close tag name */
209 : PS_ATTR_SEP, /* between attributes */
210 : PS_ATTR_NM, /* reading attribute name */
211 : PS_ATTR_EQ, /* saw '=' */
212 : PS_ATTR_VQ, /* in quoted attribute value */
213 : PS_ATTR_VU, /* in unquoted attribute value */
214 : } PState;
215 :
216 2090 : static void tolower_buf(char *s) {
217 7683 : for (; s && *s; s++) *s = (char)tolower((unsigned char)*s);
218 2090 : }
219 :
220 : /* Flush text buffer as a text node child of current stack top */
221 2429 : static void flush_text(Stack *stk, Buf *tb) {
222 2429 : if (!tb->d || !tb->n) { buf_free(tb); return; }
223 1001 : char *decoded = decode_entities(tb->d);
224 1001 : HtmlNode *tn = node_text(decoded ? decoded : strdup(tb->d));
225 1001 : if (!decoded) free(tb->d);
226 1001 : buf_free(tb);
227 1001 : if (tn) node_add_child(stk_top(stk), tn);
228 : }
229 :
230 : /* After completing a tag build: add to tree, push if non-void */
231 1136 : static void commit_elem(Stack *stk, HtmlNode *elem, int self_close) {
232 1136 : if (!elem) return;
233 1136 : if (is_auto_close(elem->tag)) {
234 331 : HtmlNode *top = stk_top(stk);
235 331 : if (top && top->tag && strcmp(top->tag, elem->tag) == 0)
236 35 : stk_pop(stk);
237 : }
238 1136 : node_add_child(stk_top(stk), elem);
239 1136 : if (!self_close && !is_void(elem->tag))
240 1056 : stk_push(stk, elem);
241 : }
242 :
243 238 : HtmlNode *html_parse(const char *html) {
244 238 : if (!html) return NULL;
245 :
246 237 : HtmlNode *root = node_elem(strdup("__root__"));
247 237 : if (!root) return NULL;
248 :
249 237 : Stack stk; stk.top = 0;
250 237 : stk_push(&stk, root);
251 :
252 237 : PState state = PS_TEXT;
253 237 : Buf tb = {0}; /* text accumulator */
254 237 : Buf nb = {0}; /* tag/attr name */
255 237 : Buf vb = {0}; /* attr value */
256 237 : char qc = '"';
257 :
258 237 : HtmlNode *cur = NULL; /* element being built */
259 :
260 237 : const char *p = html;
261 29964 : while (*p) {
262 29728 : char c = *p;
263 :
264 29728 : switch (state) {
265 :
266 : /* ── Text content ── */
267 10232 : case PS_TEXT:
268 10232 : if (c == '<') { flush_text(&stk, &tb); state = PS_LT; }
269 8040 : else buf_push(&tb, c);
270 10232 : break;
271 :
272 : /* ── Just saw '<' ── */
273 2192 : case PS_LT:
274 2192 : if (c == '/') { state = PS_CLOSE; }
275 1276 : else if (c == '!') { state = PS_BANG; }
276 1207 : else if (isalpha((unsigned char)c) || c == '_') {
277 1174 : buf_push(&nb, c); state = PS_OPEN;
278 : } else {
279 : /* Not a real tag — treat '<' as text */
280 33 : buf_push(&tb, '<'); buf_push(&tb, c); state = PS_TEXT;
281 : }
282 2192 : break;
283 :
284 : /* ── <!... ── */
285 69 : case PS_BANG:
286 69 : if (c == '-') {
287 : /* Peek: if next is also '-' it's a comment */
288 36 : if (p[1] == '-') { p++; state = PS_CMT; }
289 1 : else state = PS_DECL;
290 33 : } else state = PS_DECL;
291 69 : break;
292 :
293 245 : case PS_CMT:
294 245 : if (c == '-') state = PS_CMT_D1;
295 245 : break;
296 69 : case PS_CMT_D1:
297 69 : if (c == '-') state = PS_CMT_DD;
298 1 : else state = PS_CMT;
299 69 : break;
300 68 : case PS_CMT_DD:
301 68 : if (c == '>') state = PS_TEXT;
302 33 : else if (c != '-') state = PS_CMT;
303 68 : break;
304 410 : case PS_DECL:
305 410 : if (c == '>') state = PS_TEXT;
306 410 : break;
307 :
308 : /* ── Close tag name ── */
309 3357 : case PS_CLOSE:
310 3357 : if (isalnum((unsigned char)c) || c == '-' || c == '_' || c == ':' || c == '.') {
311 2441 : buf_push(&nb, c);
312 916 : } else if (c == '>') {
313 916 : if (nb.d) { tolower_buf(nb.d); stk_close(&stk, nb.d); }
314 916 : buf_free(&nb);
315 916 : state = PS_TEXT;
316 : }
317 : /* Skip whitespace and other chars inside </tag > */
318 3357 : break;
319 :
320 : /* ── Open tag name ── */
321 3152 : case PS_OPEN: {
322 3152 : int done = 0;
323 3152 : if (isalnum((unsigned char)c) || c == '-' || c == '_' || c == ':' || c == '.') {
324 1978 : buf_push(&nb, c);
325 : } else {
326 1174 : done = 1;
327 : }
328 3152 : if (done && nb.d) {
329 1174 : tolower_buf(nb.d);
330 :
331 : /* script / style: skip opening tag then body entirely */
332 1174 : if (strcmp(nb.d, "script") == 0 || strcmp(nb.d, "style") == 0) {
333 : char close_tag[16];
334 38 : snprintf(close_tag, sizeof(close_tag), "</%s", nb.d);
335 38 : buf_free(&nb);
336 : /* Skip to end of opening tag */
337 38 : while (*p && *p != '>') p++;
338 38 : if (*p) p++; /* skip '>' */
339 : /* Find closing tag (case-insensitive) */
340 38 : const char *end = strcasestr(p, close_tag);
341 38 : if (end) {
342 37 : p = end + strlen(close_tag);
343 37 : while (*p && *p != '>') p++;
344 : /* p now points at '>' of closing tag */
345 : } else {
346 1 : p += strlen(p); /* skip to end */
347 1 : if (*p) {} /* p already at '\0' */
348 : }
349 38 : state = PS_TEXT;
350 38 : break;
351 : }
352 :
353 1136 : char *tname = buf_take(&nb);
354 1136 : cur = node_elem(tname);
355 1136 : if (!cur) { state = PS_TEXT; break; }
356 :
357 1136 : if (c == '>') {
358 830 : commit_elem(&stk, cur, 0); cur = NULL; state = PS_TEXT;
359 306 : } else if (c == '/') {
360 1 : commit_elem(&stk, cur, 1); cur = NULL;
361 2 : while (*p && *p != '>') p++;
362 1 : state = PS_TEXT;
363 305 : } else if (isspace((unsigned char)c)) {
364 304 : state = PS_ATTR_SEP;
365 : } else {
366 : /* Attribute name starting immediately */
367 1 : buf_push(&nb, c); state = PS_ATTR_NM;
368 : }
369 : }
370 3114 : break;
371 : }
372 :
373 : /* ── Between attributes ── */
374 621 : case PS_ATTR_SEP:
375 621 : if (c == '>') {
376 237 : commit_elem(&stk, cur, 0); cur = NULL; state = PS_TEXT;
377 384 : } else if (c == '/') {
378 1 : commit_elem(&stk, cur, 1); cur = NULL;
379 2 : while (*p && *p != '>') p++;
380 1 : state = PS_TEXT;
381 383 : } else if (!isspace((unsigned char)c)) {
382 344 : buf_push(&nb, c); state = PS_ATTR_NM;
383 : }
384 621 : break;
385 :
386 : /* ── Attribute name ── */
387 1672 : case PS_ATTR_NM:
388 1672 : if (c == '=') { state = PS_ATTR_EQ; }
389 1362 : else if (c == '>') {
390 : /* Boolean attr */
391 34 : if (cur && nb.d && nb.n > 0)
392 34 : attr_prepend(cur, buf_take(&nb), strdup(""));
393 0 : else buf_free(&nb);
394 34 : commit_elem(&stk, cur, 0); cur = NULL; state = PS_TEXT;
395 1328 : } else if (isspace((unsigned char)c)) {
396 1 : if (cur && nb.d && nb.n > 0)
397 1 : attr_prepend(cur, buf_take(&nb), strdup(""));
398 0 : else buf_free(&nb);
399 1 : state = PS_ATTR_SEP;
400 : } else {
401 1327 : buf_push(&nb, c);
402 : }
403 1672 : break;
404 :
405 : /* ── After '=' ── */
406 310 : case PS_ATTR_EQ:
407 310 : if (c == '"' || c == '\'') { qc = c; state = PS_ATTR_VQ; }
408 34 : else if (!isspace((unsigned char)c)) { buf_push(&vb, c); state = PS_ATTR_VU; }
409 310 : break;
410 :
411 : /* ── Quoted attribute value ── */
412 7227 : case PS_ATTR_VQ:
413 7227 : if (c == qc) {
414 276 : if (cur) attr_prepend(cur, buf_take(&nb), buf_take(&vb));
415 0 : else { buf_free(&nb); buf_free(&vb); }
416 276 : state = PS_ATTR_SEP;
417 6951 : } else buf_push(&vb, c);
418 7227 : break;
419 :
420 : /* ── Unquoted attribute value ── */
421 104 : case PS_ATTR_VU:
422 104 : if (isspace((unsigned char)c)) {
423 1 : if (cur) attr_prepend(cur, buf_take(&nb), buf_take(&vb));
424 0 : else { buf_free(&nb); buf_free(&vb); }
425 1 : state = PS_ATTR_SEP;
426 103 : } else if (c == '>') {
427 33 : if (cur) attr_prepend(cur, buf_take(&nb), buf_take(&vb));
428 0 : else { buf_free(&nb); buf_free(&vb); }
429 33 : commit_elem(&stk, cur, 0); cur = NULL; state = PS_TEXT;
430 70 : } else buf_push(&vb, c);
431 104 : break;
432 :
433 : } /* switch */
434 :
435 29728 : if (*p) p++;
436 1 : else break;
437 : } /* while */
438 :
439 237 : flush_text(&stk, &tb);
440 237 : if (cur) html_node_free(cur); /* partially built element */
441 237 : buf_free(&nb); buf_free(&vb);
442 237 : return root;
443 : }
|