66 #include "html-strings.h"
69 #include "htmlparser.h"
/*
 * Debug output helper. Callers wrap the whole printf argument list in an
 * extra pair of parentheses, e.g. PRINTF(("fmt %d\n", x)), so the macro
 * expands to printf ("fmt %d\n", x) while taking a single macro argument.
 */
75 #define PRINTF(x) printf x
/*
 * Letter codes derived by setting bit 0x20 on the corresponding ISO_A,
 * ISO_B, ... constants, which are defined outside this section.
 * NOTE(review): presumably the ISO_<upper> constants are the upper-case
 * letter codes, making these the lower-case variants -- confirm against
 * their definitions.
 */
93 #define ISO_a (ISO_A | 0x20)
94 #define ISO_b (ISO_B | 0x20)
95 #define ISO_e (ISO_E | 0x20)
96 #define ISO_f (ISO_F | 0x20)
97 #define ISO_g (ISO_G | 0x20)
98 #define ISO_h (ISO_H | 0x20)
99 #define ISO_i (ISO_I | 0x20)
100 #define ISO_l (ISO_L | 0x20)
101 #define ISO_m (ISO_M | 0x20)
102 #define ISO_p (ISO_P | 0x20)
103 #define ISO_r (ISO_R | 0x20)
104 #define ISO_t (ISO_T | 0x20)
/*
 * Character codes for the punctuation the parser tests for or emits.
 * ISO_citation (0x22) is the double quote and ISO_citation2 (0x27) is the
 * single quote.
 */
109 #define ISO_space 0x20
110 #define ISO_bang 0x21
111 #define ISO_citation 0x22
112 #define ISO_ampersand 0x26
113 #define ISO_citation2 0x27
114 #define ISO_asterisk 0x2a
115 #define ISO_dash 0x2d
116 #define ISO_slash 0x2f
117 #define ISO_semicolon 0x3b
/*
 * NOTE: these two names are the reverse of what the ASCII values suggest:
 * 0x5b is '[' and 0x5d is ']'. The frame-link code in this file emits
 * ISO_rbrack first and ISO_lbrack last, so the output is "[...]".
 */
122 #define ISO_rbrack 0x5b
123 #define ISO_lbrack 0x5d
/*
 * Values stored in s.minorstate. parse_word() switches on this field to
 * decide how the incoming characters are interpreted (plain text, inside a
 * tag name, inside an attribute, inside an HTML comment, ...).
 */
125 #define MINORSTATE_NONE 0
126 #define MINORSTATE_TEXT 1
127 #define MINORSTATE_EXTCHAR 2
128 #define MINORSTATE_TAG 3
129 #define MINORSTATE_TAGEND 4
130 #define MINORSTATE_TAGATTR 5
131 #define MINORSTATE_TAGATTRSPACE 6
/*
 * TAGATTRPARAMNQ is entered when '=' is seen after an attribute name;
 * TAGATTRPARAM is entered from there when the value starts with a quote
 * character. NOTE(review): "NQ" presumably stands for "not quoted".
 */
133 #define MINORSTATE_TAGATTRPARAM 7
134 #define MINORSTATE_TAGATTRPARAMNQ 8
136 #define MINORSTATE_HTMLCOMMENT 9
/*
 * Values stored in s.majorstate and s.lastmajorstate. They are changed
 * through switch_majorstate(), which remembers the previous value in
 * s.lastmajorstate. htmlparser_init() starts the parser in
 * MAJORSTATE_DISCARD.
 */
138 #define MAJORSTATE_NONE 0
139 #define MAJORSTATE_BODY 1
140 #define MAJORSTATE_LINK 2
141 #define MAJORSTATE_FORM 3
142 #define MAJORSTATE_DISCARD 4
145 struct htmlparser_state {
147 unsigned char minorstate;
149 unsigned char tagptr;
151 unsigned char tagattrptr;
152 char tagattrparam[WWW_CONF_MAX_URLLEN + 1];
153 unsigned char tagattrparamptr;
154 unsigned char lastchar, quotechar;
155 unsigned char majorstate, lastmajorstate;
156 char linkurl[WWW_CONF_MAX_URLLEN + 1];
158 char word[WWW_CONF_WEBPAGE_WIDTH];
159 unsigned char wordlen;
162 char formaction[WWW_CONF_MAX_FORMACTIONLEN + 1];
163 unsigned char inputtype;
164 char inputname[WWW_CONF_MAX_INPUTNAMELEN + 1];
165 char inputvalue[WWW_CONF_MAX_INPUTVALUELEN + 1];
166 unsigned char inputvaluesize;
/* The single file-scope parser state; the functions in this file read and update it as `s`. */
170 static struct htmlparser_state s;
/*
 * One-byte array holding 0xff.
 * NOTE(review): presumably the end-of-table sentinel entry for tags[];
 * the table body is not visible in this section -- confirm.
 */
173 static char last[1] = {(char)0xff};
175 static const char *tags[] = {
179 #define TAG_SLASHDIV 1
181 #define TAG_SLASHFORM 2
185 #define TAG_SLASHSCRIPT 4
187 #define TAG_SLASHSELECT 5
189 #define TAG_SLASHSTYLE 6
217 #define TAG_SCRIPT 20
219 #define TAG_SELECT 21
233 return (c == ISO_space ||
243 s.inputtype = HTMLPARSER_INPUTTYPE_NONE;
244 s.inputname[0] = s.inputvalue[0] =
245 s.formaction[WWW_CONF_MAX_FORMACTIONLEN] =
246 s.inputname[WWW_CONF_MAX_INPUTNAMELEN] =
247 s.inputvalue[WWW_CONF_MAX_INPUTVALUELEN] = 0;
248 s.inputvaluesize = 20;
253 htmlparser_init(
void)
255 s.majorstate = s.lastmajorstate = MAJORSTATE_DISCARD;
256 s.minorstate = MINORSTATE_TEXT;
270 return (c & 0x1f) | 0x60;
280 s.tagattr[s.tagattrptr] = 0;
281 s.tagattrparam[s.tagattrparamptr] = 0;
285 switch_majorstate(
unsigned char newstate)
287 if(s.majorstate != newstate) {
288 PRINTF((
"Switching state from %d to %d (%d)\n", s.majorstate, newstate, s.lastmajorstate));
289 s.lastmajorstate = s.majorstate;
290 s.majorstate = newstate;
295 add_char(
unsigned char c)
297 if(s.wordlen < WWW_CONF_WEBPAGE_WIDTH - 1 && c < 0x80) {
298 s.word[s.wordlen] = c;
307 if(s.majorstate == MAJORSTATE_LINK) {
308 if(s.word[s.wordlen] != ISO_space) {
311 }
else if(s.majorstate == MAJORSTATE_DISCARD) {
314 s.word[s.wordlen] =
'\0';
315 htmlparser_word(s.word, s.wordlen);
325 htmlparser_newline();
331 static unsigned char first, last, i, tabi;
341 if((tagc == 0 || tagc == ISO_slash) && tags[first][i] == 0) {
348 while(tagc > (tags[tabi])[i] && tabi < last) {
354 while(tagc == (tags[tabi])[i] && tabi < last) {
363 }
while(last != first);
370 static char *tagattrparam;
371 static unsigned char size;
375 PRINTF((
"Parsing tag '%s' '%s' '%s'\n", s.tag, s.tagattr, s.tagattrparam));
377 switch(find_tag(s.tag)) {
394 add_char(ISO_asterisk);
400 switch_majorstate(MAJORSTATE_DISCARD);
402 case TAG_SLASHSCRIPT:
404 case TAG_SLASHSELECT:
406 switch_majorstate(s.lastmajorstate);
409 s.majorstate = s.lastmajorstate = MAJORSTATE_BODY;
412 if(strncmp(s.tagattr, html_src,
sizeof(html_src)) == 0 && s.tagattrparam[0] != 0) {
413 switch_majorstate(MAJORSTATE_BODY);
415 add_char(ISO_rbrack);
417 htmlparser_link((
char *)html_frame, (
unsigned char)strlen(html_frame), s.tagattrparam);
418 PRINTF((
"Frame [%s]\n", s.tagattrparam));
419 add_char(ISO_lbrack);
424 if(strncmp(s.tagattr, html_alt,
sizeof(html_alt)) == 0 && s.tagattrparam[0] != 0) {
426 tagattrparam = &s.tagattrparam[0];
427 while(*tagattrparam) {
428 add_char(*tagattrparam);
436 PRINTF((
"A %s %s\n", s.tagattr, s.tagattrparam));
437 if(strncmp(s.tagattr, html_href,
sizeof(html_href)) == 0 && s.tagattrparam[0] != 0) {
438 strcpy(s.linkurl, s.tagattrparam);
440 switch_majorstate(MAJORSTATE_LINK);
444 if(s.majorstate == MAJORSTATE_LINK) {
445 switch_majorstate(s.lastmajorstate);
446 s.word[s.wordlen] = 0;
447 htmlparser_link(s.word, s.wordlen, s.linkurl);
455 if(s.tagattr[0] == 0 && s.formaction[0] != 0) {
456 htmlparser_form(s.formaction);
459 PRINTF((
"Form tag\n"));
460 switch_majorstate(MAJORSTATE_FORM);
461 if(strncmp(s.tagattr, html_action,
sizeof(html_action)) == 0) {
462 PRINTF((
"Form action '%s'\n", s.tagattrparam));
463 strncpy(s.formaction, s.tagattrparam, WWW_CONF_MAX_FORMACTIONLEN - 1);
468 switch_majorstate(MAJORSTATE_BODY);
472 if(s.majorstate == MAJORSTATE_FORM) {
475 if(s.tagattr[0] == 0 && s.inputname[0] != 0) {
476 PRINTF((
"Render input type %d\n", s.inputtype));
477 switch(s.inputtype) {
478 case HTMLPARSER_INPUTTYPE_NONE:
479 case HTMLPARSER_INPUTTYPE_TEXT:
480 case HTMLPARSER_INPUTTYPE_HIDDEN:
481 htmlparser_inputfield(s.inputtype, s.inputvaluesize, s.inputvalue, s.inputname);
483 case HTMLPARSER_INPUTTYPE_SUBMIT:
484 case HTMLPARSER_INPUTTYPE_IMAGE:
485 htmlparser_submitbutton(s.inputvalue, s.inputname);
490 PRINTF((
"Input '%s' '%s'\n", s.tagattr, s.tagattrparam));
491 if(strncmp(s.tagattr, html_type,
sizeof(html_type)) == 0) {
492 if(strncmp(s.tagattrparam, html_submit,
sizeof(html_submit)) == 0) {
493 s.inputtype = HTMLPARSER_INPUTTYPE_SUBMIT;
494 }
else if(strncmp(s.tagattrparam, html_image,
sizeof(html_image)) == 0) {
495 s.inputtype = HTMLPARSER_INPUTTYPE_IMAGE;
496 }
else if(strncmp(s.tagattrparam, html_text,
sizeof(html_text)) == 0) {
497 s.inputtype = HTMLPARSER_INPUTTYPE_TEXT;
498 }
else if(strncmp(s.tagattrparam, html_hidden,
sizeof(html_hidden)) == 0) {
499 s.inputtype = HTMLPARSER_INPUTTYPE_HIDDEN;
501 s.inputtype = HTMLPARSER_INPUTTYPE_OTHER;
503 }
else if(strncmp(s.tagattr, html_name,
sizeof(html_name)) == 0) {
504 strncpy(s.inputname, s.tagattrparam, WWW_CONF_MAX_INPUTNAMELEN);
505 }
else if(strncmp(s.tagattr, html_alt,
sizeof(html_alt)) == 0 &&
506 s.inputtype == HTMLPARSER_INPUTTYPE_IMAGE) {
507 strncpy(s.inputvalue, s.tagattrparam, WWW_CONF_MAX_INPUTVALUELEN);
508 }
else if(strncmp(s.tagattr, html_value,
sizeof(html_value)) == 0) {
509 strncpy(s.inputvalue, s.tagattrparam, WWW_CONF_MAX_INPUTVALUELEN);
510 }
else if(strncmp(s.tagattr, html_size,
sizeof(html_size)) == 0) {
512 if(s.tagattrparam[0] >=
'0' &&
513 s.tagattrparam[0] <=
'9') {
514 size = s.tagattrparam[0] -
'0';
515 if(s.tagattrparam[1] >=
'0' &&
516 s.tagattrparam[1] <=
'9') {
517 size = size * 10 + (s.tagattrparam[1] -
'0');
520 if(size >= WWW_CONF_MAX_INPUTVALUELEN) {
521 size = WWW_CONF_MAX_INPUTVALUELEN - 1;
523 s.inputvaluesize = size;
533 parse_word(
char *data, uint8_t dlen)
541 switch(s.minorstate) {
542 case MINORSTATE_TEXT:
543 for(i = 0; i < len; ++i) {
545 if(iswhitespace(c)) {
547 }
else if(c == ISO_lt) {
548 s.minorstate = MINORSTATE_TAG;
551 }
else if(c == ISO_ampersand) {
552 s.minorstate = MINORSTATE_EXTCHAR;
559 case MINORSTATE_EXTCHAR:
560 for(i = 0; i < len; ++i) {
562 if(c == ISO_semicolon) {
563 s.minorstate = MINORSTATE_TEXT;
566 }
else if(iswhitespace(c)) {
567 s.minorstate = MINORSTATE_TEXT;
579 for(i = 0; i < len; ++i) {
583 s.minorstate = MINORSTATE_TEXT;
584 s.tagattrptr = s.tagattrparamptr = 0;
588 }
else if(iswhitespace(c)) {
591 s.minorstate = MINORSTATE_TAGATTR;
598 s.tag[s.tagptr] = lowercase(c);
602 if(s.tagptr ==
sizeof(s.tag)) {
603 s.minorstate = MINORSTATE_TAGEND;
610 s.tag[0] == ISO_bang &&
611 s.tag[1] == ISO_dash &&
612 s.tag[2] == ISO_dash) {
613 PRINTF((
"Starting comment...\n"));
614 s.minorstate = MINORSTATE_HTMLCOMMENT;
621 case MINORSTATE_TAGATTR:
624 for(i = 0; i < len; ++i) {
628 s.minorstate = MINORSTATE_TEXT;
629 s.tagattrparamptr = 0;
636 }
else if(iswhitespace(c)) {
637 if(s.tagattrptr == 0) {
641 s.tagattrparamptr = 0;
644 s.minorstate = MINORSTATE_TAGATTRSPACE;
647 }
else if(c == ISO_eq) {
648 s.minorstate = MINORSTATE_TAGATTRPARAMNQ;
649 s.tagattrparamptr = 0;
653 s.tagattr[s.tagattrptr] = lowercase(c);
657 if(s.tagattrptr ==
sizeof(s.tagattr)) {
658 s.minorstate = MINORSTATE_TAGEND;
664 case MINORSTATE_TAGATTRSPACE:
665 for(i = 0; i < len; ++i) {
667 if(iswhitespace(c)) {
669 }
else if(c == ISO_eq) {
670 s.minorstate = MINORSTATE_TAGATTRPARAMNQ;
671 s.tagattrparamptr = 0;
676 s.tagattr[0] = lowercase(c);
678 s.minorstate = MINORSTATE_TAGATTR;
683 case MINORSTATE_TAGATTRPARAMNQ:
686 for(i = 0; i < len; ++i) {
692 s.minorstate = MINORSTATE_TEXT;
699 }
else if(iswhitespace(c) && s.tagattrparamptr == 0) {
701 }
else if((c == ISO_citation ||
702 c == ISO_citation2) && s.tagattrparamptr == 0) {
703 s.minorstate = MINORSTATE_TAGATTRPARAM;
705 PRINTF((
"tag attr param q found\n"));
707 }
else if(iswhitespace(c)) {
708 PRINTF((
"Non-leading space found at %d\n", s.tagattrparamptr));
713 s.minorstate = MINORSTATE_TAGATTR;
718 s.tagattrparam[s.tagattrparamptr] = c;
722 if(s.tagattrparamptr >=
sizeof(s.tagattrparam) - 1) {
723 s.minorstate = MINORSTATE_TAGEND;
729 case MINORSTATE_TAGATTRPARAM:
732 for(i = 0; i < len; ++i) {
734 if(c == s.quotechar) {
739 s.minorstate = MINORSTATE_TAGATTR;
744 if(iswhitespace(c)) {
745 s.tagattrparam[s.tagattrparamptr] = ISO_space;
747 s.tagattrparam[s.tagattrparamptr] = c;
753 if(s.tagattrparamptr >=
sizeof(s.tagattrparam) - 1) {
754 s.minorstate = MINORSTATE_TAGEND;
760 case MINORSTATE_HTMLCOMMENT:
761 for(i = 0; i < len; ++i) {
765 }
else if(c == ISO_gt && s.tagptr > 0) {
766 PRINTF((
"Comment done.\n"));
767 s.minorstate = MINORSTATE_TEXT;
774 case MINORSTATE_TAGEND:
776 for(i = 0; i < len; ++i) {
777 if(data[i] == ISO_gt) {
778 s.minorstate = MINORSTATE_TEXT;
797 htmlparser_parse(
char *data, uint16_t datalen)
803 plen = parse_word(data, 255);
805 plen = parse_word(data, (uint8_t)datalen);
#define CC_FASTCALL
/* Configures whether the C compiler supports fastcall function declarations. */