Line data Source code
1 : #include "html_parser.h"
2 : #include <stdio.h>
3 : #include <stdlib.h>
4 : #include <string.h>
5 : #include <ctype.h>
6 :
7 : /* ── Void elements (never have children) ───────────────────────────── */
8 :
9 : static const char * const VOID_TAGS[] = {
10 : "area","base","br","col","embed","hr","img","input",
11 : "link","meta","param","source","track","wbr", NULL
12 : };
13 317 : static int is_void(const char *t) {
14 317 : if (!t) return 0;
15 4255 : for (int i = 0; VOID_TAGS[i]; i++)
16 3984 : if (strcmp(t, VOID_TAGS[i]) == 0) return 1;
17 271 : return 0;
18 : }
19 :
20 : /* Auto-close: opening one of these implicitly closes the previous sibling */
21 : static const char * const AUTO_CLOSE_TAGS[] = {
22 : "li","p","dt","dd","tr","td","th", NULL
23 : };
24 319 : static int is_auto_close(const char *t) {
25 319 : if (!t) return 0;
26 2372 : for (int i = 0; AUTO_CLOSE_TAGS[i]; i++)
27 2096 : if (strcmp(t, AUTO_CLOSE_TAGS[i]) == 0) return 1;
28 276 : return 0;
29 : }
30 :
31 : /* ── Dynamic string ─────────────────────────────────────────────────── */
32 :
33 : typedef struct { char *d; size_t n, cap; } Buf;
34 :
35 5851 : static int buf_push(Buf *b, char c) {
36 5851 : if (b->n + 1 >= b->cap) {
37 1062 : size_t nc = b->cap ? b->cap * 2 : 64;
38 1062 : char *t = realloc(b->d, nc);
39 1062 : if (!t) return 0;
40 1062 : b->d = t; b->cap = nc;
41 : }
42 5851 : b->d[b->n++] = c;
43 5851 : b->d[b->n] = '\0';
44 5851 : return 1;
45 : }
46 556 : static char *buf_take(Buf *b) {
47 556 : char *r = b->d; b->d = NULL; b->n = b->cap = 0; return r;
48 : }
49 1341 : static void buf_free(Buf *b) { free(b->d); b->d = NULL; b->n = b->cap = 0; }
50 :
51 : /* ── Node helpers ───────────────────────────────────────────────────── */
52 :
53 521 : static HtmlNode *node_elem(char *tag) {
54 521 : HtmlNode *n = calloc(1, sizeof(*n));
55 521 : if (!n) { free(tag); return NULL; }
56 521 : n->type = HTML_NODE_ELEMENT; n->tag = tag; return n;
57 : }
58 298 : static HtmlNode *node_text(char *text) {
59 298 : HtmlNode *n = calloc(1, sizeof(*n));
60 298 : if (!n) { free(text); return NULL; }
61 298 : n->type = HTML_NODE_TEXT; n->text = text; return n;
62 : }
63 819 : static void attr_free_list(HtmlAttr *a) {
64 939 : while (a) { HtmlAttr *nx = a->next; free(a->name); free(a->value); free(a); a = nx; }
65 819 : }
66 1841 : void html_node_free(HtmlNode *node) {
67 1841 : if (!node) return;
68 819 : html_node_free(node->first_child);
69 819 : html_node_free(node->next_sibling);
70 819 : attr_free_list(node->attrs);
71 819 : free(node->tag); free(node->text); free(node);
72 : }
73 329 : const char *html_attr_get(const HtmlNode *node, const char *name) {
74 329 : if (!node || !name) return NULL;
75 349 : for (HtmlAttr *a = node->attrs; a; a = a->next)
76 140 : if (strcmp(a->name, name) == 0) return a->value;
77 209 : return NULL;
78 : }
79 617 : static void node_add_child(HtmlNode *parent, HtmlNode *child) {
80 617 : if (!parent || !child) return;
81 617 : child->parent = parent;
82 617 : if (!parent->first_child) { parent->first_child = child; return; }
83 160 : HtmlNode *last = parent->first_child;
84 657 : while (last->next_sibling) last = last->next_sibling;
85 160 : last->next_sibling = child;
86 : }
87 120 : static void attr_prepend(HtmlNode *elem, char *name, char *value) {
88 120 : HtmlAttr *a = calloc(1, sizeof(*a));
89 120 : if (!a) { free(name); free(value); return; }
90 120 : a->name = name; a->value = value; a->next = elem->attrs; elem->attrs = a;
91 : }
92 :
93 : /* ── Entity decoder ─────────────────────────────────────────────────── */
94 :
95 : static const struct { const char *name; unsigned cp; } ENTITIES[] = {
96 : {"amp",'&'},{"lt",'<'},{"gt",'>'},{"quot",'"'},{"apos",'\''},
97 : {"nbsp",0xA0},{"copy",0xA9},{"reg",0xAE},{"mdash",0x2014},
98 : {"ndash",0x2013},{"hellip",0x2026},{"laquo",0xAB},{"raquo",0xBB},
99 : /* zero-width / formatting */
100 : {"zwnj",0x200C},{"zwj",0x200D},{"shy",0xAD},
101 : /* common symbols */
102 : {"bull",0x2022},{"middot",0xB7},{"trade",0x2122},{"euro",0x20AC},
103 : {"pound",0xA3},{"cent",0xA2},{"yen",0xA5},
104 : {"times",0xD7},{"divide",0xF7},
105 : {NULL,0}
106 : };
107 13 : static int cp_to_utf8(unsigned cp, char *out) {
108 13 : if (cp < 0x80) { out[0]=(char)cp; return 1; }
109 7 : if (cp < 0x800) { out[0]=(char)(0xC0|(cp>>6)); out[1]=(char)(0x80|(cp&0x3F)); return 2; }
110 4 : if (cp < 0x10000) {
111 2 : out[0]=(char)(0xE0|(cp>>12)); out[1]=(char)(0x80|((cp>>6)&0x3F));
112 2 : out[2]=(char)(0x80|(cp&0x3F)); return 3;
113 : }
114 2 : if (cp <= 0x10FFFF) {
115 1 : out[0]=(char)(0xF0|(cp>>18)); out[1]=(char)(0x80|((cp>>12)&0x3F));
116 1 : out[2]=(char)(0x80|((cp>>6)&0x3F)); out[3]=(char)(0x80|(cp&0x3F)); return 4;
117 : }
118 1 : return 0;
119 : }
120 298 : static char *decode_entities(const char *in) {
121 298 : if (!in) return NULL;
122 298 : size_t len = strlen(in);
123 298 : char *out = malloc(len * 4 + 1);
124 298 : if (!out) return NULL;
125 298 : size_t n = 0;
126 1941 : for (size_t i = 0; i < len; ) {
127 1643 : if (in[i] != '&') { out[n++] = in[i++]; continue; }
128 : /* find ';' within reasonable range */
129 14 : size_t j = i + 1;
130 75 : while (j < len && in[j] != ';' && j - i < 14) j++;
131 14 : if (j < len && in[j] == ';' && j > i + 1) {
132 14 : const char *e = in + i + 1;
133 14 : size_t el = j - i - 1;
134 14 : unsigned cp = 0; int matched = 0;
135 14 : if (el > 0 && e[0] == '#') {
136 7 : if (el > 1 && (e[1]=='x'||e[1]=='X')) {
137 27 : for (size_t k=2;k<el;k++) {
138 21 : unsigned char c=(unsigned char)e[k];
139 21 : if (c>='0'&&c<='9') cp=cp*16+(c-'0');
140 3 : else if (c>='a'&&c<='f') cp=cp*16+(c-'a'+10);
141 2 : else if (c>='A'&&c<='F') cp=cp*16+(c-'A'+10);
142 : }
143 : } else {
144 3 : for (size_t k=1;k<el;k++)
145 2 : if (e[k]>='0'&&e[k]<='9') cp=cp*10+(unsigned char)(e[k]-'0');
146 : }
147 7 : matched = (cp > 0);
148 7 : } else if (el < 10) {
149 7 : char nm[10]={0}; memcpy(nm,e,el);
150 53 : for (int k=0; ENTITIES[k].name; k++)
151 52 : if (strcmp(ENTITIES[k].name,nm)==0) { cp=ENTITIES[k].cp; matched=1; break; }
152 : }
153 14 : if (matched) {
154 13 : char utf8[5]={0}; int nb=cp_to_utf8(cp,utf8);
155 35 : for (int k=0;k<nb;k++) out[n++]=utf8[k];
156 13 : i = j + 1; continue;
157 : }
158 : }
159 1 : out[n++] = in[i++];
160 : }
161 298 : out[n] = '\0';
162 298 : return out;
163 : }
164 :
165 : /* ── Open-element stack ─────────────────────────────────────────────── */
166 :
167 : #define STACK_MAX 256
168 : typedef struct { HtmlNode *n[STACK_MAX]; int top; } Stack;
169 473 : static void stk_push(Stack *s, HtmlNode *n) { if (s->top<STACK_MAX) s->n[s->top++]=n; }
170 660 : static HtmlNode *stk_top(Stack *s) { return s->top>0 ? s->n[s->top-1] : NULL; }
171 3 : static void stk_pop(Stack *s) { if (s->top>0) s->top--; }
172 : /* Pop up to and including the nearest matching tag; return 1 if found */
173 195 : static int stk_close(Stack *s, const char *tag) {
174 239 : for (int i=s->top-1; i>=0; i--)
175 238 : if (s->n[i]->tag && strcmp(s->n[i]->tag,tag)==0) { s->top=i; return 1; }
176 1 : return 0;
177 : }
178 :
179 : /* ── Parser ─────────────────────────────────────────────────────────── */
180 :
181 : typedef enum {
182 : PS_TEXT,
183 : PS_LT, /* saw '<' */
184 : PS_BANG, /* saw '<!' */
185 : PS_CMT, /* inside <!-- --> */
186 : PS_CMT_D1, /* saw '-' inside comment */
187 : PS_CMT_DD, /* saw '--' inside comment */
188 : PS_DECL, /* inside <!DOCTYPE ...> */
189 : PS_OPEN, /* reading open tag name */
190 : PS_CLOSE, /* reading close tag name */
191 : PS_ATTR_SEP, /* between attributes */
192 : PS_ATTR_NM, /* reading attribute name */
193 : PS_ATTR_EQ, /* saw '=' */
194 : PS_ATTR_VQ, /* in quoted attribute value */
195 : PS_ATTR_VU, /* in unquoted attribute value */
196 : } PState;
197 :
198 524 : static void tolower_buf(char *s) {
199 2015 : for (; s && *s; s++) *s = (char)tolower((unsigned char)*s);
200 524 : }
201 :
202 : /* Flush text buffer as a text node child of current stack top */
203 732 : static void flush_text(Stack *stk, Buf *tb) {
204 732 : if (!tb->d || !tb->n) { buf_free(tb); return; }
205 298 : char *decoded = decode_entities(tb->d);
206 298 : HtmlNode *tn = node_text(decoded ? decoded : strdup(tb->d));
207 298 : if (!decoded) free(tb->d);
208 298 : buf_free(tb);
209 298 : if (tn) node_add_child(stk_top(stk), tn);
210 : }
211 :
212 : /* After completing a tag build: add to tree, push if non-void */
213 319 : static void commit_elem(Stack *stk, HtmlNode *elem, int self_close) {
214 319 : if (!elem) return;
215 319 : if (is_auto_close(elem->tag)) {
216 43 : HtmlNode *top = stk_top(stk);
217 43 : if (top && top->tag && strcmp(top->tag, elem->tag) == 0)
218 3 : stk_pop(stk);
219 : }
220 319 : node_add_child(stk_top(stk), elem);
221 319 : if (!self_close && !is_void(elem->tag))
222 271 : stk_push(stk, elem);
223 : }
224 :
225 203 : HtmlNode *html_parse(const char *html) {
226 203 : if (!html) return NULL;
227 :
228 202 : HtmlNode *root = node_elem(strdup("__root__"));
229 202 : if (!root) return NULL;
230 :
231 202 : Stack stk; stk.top = 0;
232 202 : stk_push(&stk, root);
233 :
234 202 : PState state = PS_TEXT;
235 202 : Buf tb = {0}; /* text accumulator */
236 202 : Buf nb = {0}; /* tag/attr name */
237 202 : Buf vb = {0}; /* attr value */
238 202 : char qc = '"';
239 :
240 202 : HtmlNode *cur = NULL; /* element being built */
241 :
242 202 : const char *p = html;
243 7833 : while (*p) {
244 7632 : char c = *p;
245 :
246 7632 : switch (state) {
247 :
248 : /* ── Text content ── */
249 2238 : case PS_TEXT:
250 2238 : if (c == '<') { flush_text(&stk, &tb); state = PS_LT; }
251 1708 : else buf_push(&tb, c);
252 2238 : break;
253 :
254 : /* ── Just saw '<' ── */
255 530 : case PS_LT:
256 530 : if (c == '/') { state = PS_CLOSE; }
257 335 : else if (c == '!') { state = PS_BANG; }
258 330 : else if (isalpha((unsigned char)c) || c == '_') {
259 329 : buf_push(&nb, c); state = PS_OPEN;
260 : } else {
261 : /* Not a real tag — treat '<' as text */
262 1 : buf_push(&tb, '<'); buf_push(&tb, c); state = PS_TEXT;
263 : }
264 530 : break;
265 :
266 : /* ── <!... ── */
267 5 : case PS_BANG:
268 5 : if (c == '-') {
269 : /* Peek: if next is also '-' it's a comment */
270 4 : if (p[1] == '-') { p++; state = PS_CMT; }
271 1 : else state = PS_DECL;
272 1 : } else state = PS_DECL;
273 5 : break;
274 :
275 21 : case PS_CMT:
276 21 : if (c == '-') state = PS_CMT_D1;
277 21 : break;
278 5 : case PS_CMT_D1:
279 5 : if (c == '-') state = PS_CMT_DD;
280 1 : else state = PS_CMT;
281 5 : break;
282 4 : case PS_CMT_DD:
283 4 : if (c == '>') state = PS_TEXT;
284 1 : else if (c != '-') state = PS_CMT;
285 4 : break;
286 26 : case PS_DECL:
287 26 : if (c == '>') state = PS_TEXT;
288 26 : break;
289 :
290 : /* ── Close tag name ── */
291 767 : case PS_CLOSE:
292 767 : if (isalnum((unsigned char)c) || c == '-' || c == '_' || c == ':' || c == '.') {
293 572 : buf_push(&nb, c);
294 195 : } else if (c == '>') {
295 195 : if (nb.d) { tolower_buf(nb.d); stk_close(&stk, nb.d); }
296 195 : buf_free(&nb);
297 195 : state = PS_TEXT;
298 : }
299 : /* Skip whitespace and other chars inside </tag > */
300 767 : break;
301 :
302 : /* ── Open tag name ── */
303 919 : case PS_OPEN: {
304 919 : int done = 0;
305 919 : if (isalnum((unsigned char)c) || c == '-' || c == '_' || c == ':' || c == '.') {
306 590 : buf_push(&nb, c);
307 : } else {
308 329 : done = 1;
309 : }
310 919 : if (done && nb.d) {
311 329 : tolower_buf(nb.d);
312 :
313 : /* script / style: skip opening tag then body entirely */
314 329 : if (strcmp(nb.d, "script") == 0 || strcmp(nb.d, "style") == 0) {
315 10 : char close_tag[16];
316 10 : snprintf(close_tag, sizeof(close_tag), "</%s", nb.d);
317 10 : buf_free(&nb);
318 : /* Skip to end of opening tag */
319 10 : while (*p && *p != '>') p++;
320 10 : if (*p) p++; /* skip '>' */
321 : /* Find closing tag (case-insensitive) */
322 10 : const char *end = strcasestr(p, close_tag);
323 10 : if (end) {
324 9 : p = end + strlen(close_tag);
325 9 : while (*p && *p != '>') p++;
326 : /* p now points at '>' of closing tag */
327 : } else {
328 1 : p += strlen(p); /* skip to end */
329 1 : if (*p) {} /* p already at '\0' */
330 : }
331 10 : state = PS_TEXT;
332 10 : break;
333 : }
334 :
335 319 : char *tname = buf_take(&nb);
336 319 : cur = node_elem(tname);
337 319 : if (!cur) { state = PS_TEXT; break; }
338 :
339 319 : if (c == '>') {
340 206 : commit_elem(&stk, cur, 0); cur = NULL; state = PS_TEXT;
341 113 : } else if (c == '/') {
342 1 : commit_elem(&stk, cur, 1); cur = NULL;
343 2 : while (*p && *p != '>') p++;
344 1 : state = PS_TEXT;
345 112 : } else if (isspace((unsigned char)c)) {
346 111 : state = PS_ATTR_SEP;
347 : } else {
348 : /* Attribute name starting immediately */
349 1 : buf_push(&nb, c); state = PS_ATTR_NM;
350 : }
351 : }
352 909 : break;
353 : }
354 :
355 : /* ── Between attributes ── */
356 235 : case PS_ATTR_SEP:
357 235 : if (c == '>') {
358 108 : commit_elem(&stk, cur, 0); cur = NULL; state = PS_TEXT;
359 127 : } else if (c == '/') {
360 1 : commit_elem(&stk, cur, 1); cur = NULL;
361 2 : while (*p && *p != '>') p++;
362 1 : state = PS_TEXT;
363 126 : } else if (!isspace((unsigned char)c)) {
364 119 : buf_push(&nb, c); state = PS_ATTR_NM;
365 : }
366 235 : break;
367 :
368 : /* ── Attribute name ── */
369 580 : case PS_ATTR_NM:
370 580 : if (c == '=') { state = PS_ATTR_EQ; }
371 463 : else if (c == '>') {
372 : /* Boolean attr */
373 2 : if (cur && nb.d && nb.n > 0)
374 2 : attr_prepend(cur, buf_take(&nb), strdup(""));
375 0 : else buf_free(&nb);
376 2 : commit_elem(&stk, cur, 0); cur = NULL; state = PS_TEXT;
377 461 : } else if (isspace((unsigned char)c)) {
378 1 : if (cur && nb.d && nb.n > 0)
379 1 : attr_prepend(cur, buf_take(&nb), strdup(""));
380 0 : else buf_free(&nb);
381 1 : state = PS_ATTR_SEP;
382 : } else {
383 460 : buf_push(&nb, c);
384 : }
385 580 : break;
386 :
387 : /* ── After '=' ── */
388 117 : case PS_ATTR_EQ:
389 117 : if (c == '"' || c == '\'') { qc = c; state = PS_ATTR_VQ; }
390 2 : else if (!isspace((unsigned char)c)) { buf_push(&vb, c); state = PS_ATTR_VU; }
391 117 : break;
392 :
393 : /* ── Quoted attribute value ── */
394 2177 : case PS_ATTR_VQ:
395 2177 : if (c == qc) {
396 115 : if (cur) attr_prepend(cur, buf_take(&nb), buf_take(&vb));
397 0 : else { buf_free(&nb); buf_free(&vb); }
398 115 : state = PS_ATTR_SEP;
399 2062 : } else buf_push(&vb, c);
400 2177 : break;
401 :
402 : /* ── Unquoted attribute value ── */
403 8 : case PS_ATTR_VU:
404 8 : if (isspace((unsigned char)c)) {
405 1 : if (cur) attr_prepend(cur, buf_take(&nb), buf_take(&vb));
406 0 : else { buf_free(&nb); buf_free(&vb); }
407 1 : state = PS_ATTR_SEP;
408 7 : } else if (c == '>') {
409 1 : if (cur) attr_prepend(cur, buf_take(&nb), buf_take(&vb));
410 0 : else { buf_free(&nb); buf_free(&vb); }
411 1 : commit_elem(&stk, cur, 0); cur = NULL; state = PS_TEXT;
412 6 : } else buf_push(&vb, c);
413 8 : break;
414 :
415 : } /* switch */
416 :
417 7632 : if (*p) p++;
418 1 : else break;
419 : } /* while */
420 :
421 202 : flush_text(&stk, &tb);
422 202 : if (cur) html_node_free(cur); /* partially built element */
423 202 : buf_free(&nb); buf_free(&vb);
424 202 : return root;
425 : }
|