/*
 * Copyright (c) 2003, 2004 X/IO Labs, xiolabs.com.
 * Copyright (c) 2003, 2004, 2005 Lev Walkin <vlm@lionet.info>.
 * All rights reserved.
 * Redistribution and modifications are permitted subject to BSD license.
 */
|
||
|
#include <asn_system.h>
|
||
|
#include <xer_support.h>
|
||
|
|
||
|
/*
 * Parser states.
 *
 * pxml_parse() loads the current state from an int on entry and stores it
 * back into that int on return, so the numeric values (declaration order)
 * of these enumerators are what travels between calls.
 */
typedef enum {
	ST_TEXT,		/* Plain text; '<' leads to ST_TAG_START */
	ST_TAG_START,		/* Just seen '<' */
	ST_TAG_BODY,		/* Inside a tag */
	ST_TAG_QUOTE_WAIT,	/* Seen '=' inside a tag */
	ST_TAG_QUOTED_STRING,	/* Inside a double-quoted attribute value */
	ST_TAG_UNQUOTED_STRING,	/* Inside an unquoted attribute value */
	ST_COMMENT_WAIT_DASH1,	/* "<!--"[1] */
	ST_COMMENT_WAIT_DASH2,	/* "<!--"[2] */
	ST_COMMENT,		/* Inside a comment */
	ST_COMMENT_CLO_DASH2,	/* "-->"[0] */
	ST_COMMENT_CLO_RT	/* "-->"[1] */
} pstate_e;
|
||
|
|
||
|
/*
 * Character class of every byte value:
 *   0 - none of the below
 *   1 - whitespace (HT, LF, FF, CR and space)
 *   2 - decimal digit
 *   3 - ASCII letter
 * Only the first 128 entries are spelled out; the entries for bytes
 * 0x80..0xff are implicitly zero-initialized, i.e. class 0.
 */
static const int
_charclass[256] = {
	0,0,0,0,0,0,0,0, 0,1,1,0,1,1,0,0,
	0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
	1,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
	2,2,2,2,2,2,2,2, 2,2,0,0,0,0,0,0,	/* 01234567 89       */
	0,3,3,3,3,3,3,3, 3,3,3,3,3,3,3,3,	/*  ABCDEFG HIJKLMNO */
	3,3,3,3,3,3,3,3, 3,3,3,0,0,0,0,0,	/* PQRSTUVW XYZ      */
	0,3,3,3,3,3,3,3, 3,3,3,3,3,3,3,3,	/*  abcdefg hijklmno */
	3,3,3,3,3,3,3,3, 3,3,3,0,0,0,0,0	/* pqrstuvw xyz      */
};

/*
 * Classification helpers. The argument is narrowed to unsigned char before
 * indexing, so a negative (signed) char can never index out of the table.
 */
#define WHITESPACE(c)	(_charclass[(unsigned char)(c)] == 1)
#define ALNUM(c)	(_charclass[(unsigned char)(c)] >= 2)
#define ALPHA(c)	(_charclass[(unsigned char)(c)] == 3)
|
||
|
|
||
|
/*
 * Aliases for characters, ASCII/UTF-8.
 * Given as numeric codes, so they do not depend on the character set
 * the compiler uses for character literals.
 */
#define	EXCLAM	0x21	/* '!' */
#define	CQUOTE	0x22	/* '"' */
#define	CDASH	0x2d	/* '-' */
#define	CSLASH	0x2f	/* '/' */
#define	LANGLE	0x3c	/* '<' */
#define	CEQUAL	0x3d	/* '=' */
#define	RANGLE	0x3e	/* '>' */
#define	CQUEST	0x3f	/* '?' */
|
||
|
|
||
|
/*
 * Invoke token callback.
 *
 * Reports the pending chunk, which starts at chunk_start and ends just
 * before p (or includes the byte at p when _current_too is 1), and then
 * switches the parser to state _ns.
 *
 * The expansion site must provide: the locals p, chunk_start and state,
 * the callback cb with its key argument, and a "finish" label.
 *
 *  - An empty chunk is not reported; only the state is changed.
 *  - If the callback returns less than the chunk size, control jumps to
 *    "finish" with chunk_start left untouched. The new state is still
 *    recorded in the one case where the current byte was part of the
 *    chunk and the callback returned -1.
 *  - Otherwise chunk_start is advanced past the reported chunk.
 *
 * _final is accepted but not used by the expansion.
 */
#define	TOKEN_CB_CALL(type, _ns, _current_too, _final) do {	\
		int _ret;					\
		pstate_e ns = (_ns);				\
		ssize_t _sz = (p - chunk_start) + (_current_too);	\
		if(!_sz) {					\
			/* Shortcut */				\
			state = ns;				\
			break;					\
		}						\
		_ret = cb(type, chunk_start, _sz, key);		\
		if(_ret < _sz) {				\
			if((_current_too) && _ret == -1)	\
				state = ns;			\
			goto finish;				\
		}						\
		chunk_start = p + (_current_too);		\
		state = ns;					\
	} while(0)
|
||
|
|
||
|
/*
 * Report the pending chunk with the given chunk type and move to state _ns.
 * Passes 0 as the "final" argument of TOKEN_CB_CALL().
 */
#define	TOKEN_CB(_type, _ns, _current_too)			\
	TOKEN_CB_CALL(_type, _ns, _current_too, 0)
|
||
|
|
||
|
/*
 * Chunk types reported for the closing chunk of a tag or a comment.
 * TOKEN_CB_FINAL() selects one of them by pasting "_FINAL_CHUNK_TYPE"
 * onto the chunk type it is given.
 */
#define	PXML_TAG_FINAL_CHUNK_TYPE	PXML_TAG_END
#define	PXML_COMMENT_FINAL_CHUNK_TYPE	PXML_COMMENT_END

/*
 * Report the pending chunk as the closing one of its kind:
 * _type must be PXML_TAG or PXML_COMMENT, which is turned into
 * PXML_TAG_END or PXML_COMMENT_END respectively.
 * Passes 1 as the "final" argument of TOKEN_CB_CALL().
 */
#define	TOKEN_CB_FINAL(_type, _ns, _current_too)		\
	TOKEN_CB_CALL( _type ## _FINAL_CHUNK_TYPE , _ns, _current_too, 1)
|
||
|
|
||
|
/*
|
||
|
* Parser itself
|
||
|
*/
|
||
|
ssize_t pxml_parse(int *stateContext, const void *xmlbuf, size_t size, pxml_callback_f *cb, void *key) {
|
||
|
pstate_e state = (pstate_e)*stateContext;
|
||
|
const char *chunk_start = (const char *)xmlbuf;
|
||
|
const char *p = chunk_start;
|
||
|
const char *end = p + size;
|
||
|
|
||
|
for(; p < end; p++) {
|
||
|
int C = *(const unsigned char *)p;
|
||
|
switch(state) {
|
||
|
case ST_TEXT:
|
||
|
/*
|
||
|
* Initial state: we're in the middle of some text,
|
||
|
* or just have started.
|
||
|
*/
|
||
|
if (C == LANGLE)
|
||
|
/* We're now in the tag, probably */
|
||
|
TOKEN_CB(PXML_TEXT, ST_TAG_START, 0);
|
||
|
break;
|
||
|
case ST_TAG_START:
|
||
|
if (ALPHA(C) || (C == CSLASH))
|
||
|
state = ST_TAG_BODY;
|
||
|
else if (C == EXCLAM)
|
||
|
state = ST_COMMENT_WAIT_DASH1;
|
||
|
else
|
||
|
/*
|
||
|
* Not characters and not whitespace.
|
||
|
* Must be something like "3 < 4".
|
||
|
*/
|
||
|
TOKEN_CB(PXML_TEXT, ST_TEXT, 1);/* Flush as data */
|
||
|
break;
|
||
|
case ST_TAG_BODY:
|
||
|
switch(C) {
|
||
|
case RANGLE:
|
||
|
/* End of the tag */
|
||
|
TOKEN_CB_FINAL(PXML_TAG, ST_TEXT, 1);
|
||
|
break;
|
||
|
case LANGLE:
|
||
|
/*
|
||
|
* The previous tag wasn't completed, but still
|
||
|
* recognized as valid. (Mozilla-compatible)
|
||
|
*/
|
||
|
TOKEN_CB_FINAL(PXML_TAG, ST_TAG_START, 0);
|
||
|
break;
|
||
|
case CEQUAL:
|
||
|
state = ST_TAG_QUOTE_WAIT;
|
||
|
break;
|
||
|
}
|
||
|
break;
|
||
|
case ST_TAG_QUOTE_WAIT:
|
||
|
/*
|
||
|
* State after the equal sign ("=") in the tag.
|
||
|
*/
|
||
|
switch(C) {
|
||
|
case CQUOTE:
|
||
|
state = ST_TAG_QUOTED_STRING;
|
||
|
break;
|
||
|
case RANGLE:
|
||
|
/* End of the tag */
|
||
|
TOKEN_CB_FINAL(PXML_TAG, ST_TEXT, 1);
|
||
|
break;
|
||
|
default:
|
||
|
if(!WHITESPACE(C))
|
||
|
/* Unquoted string value */
|
||
|
state = ST_TAG_UNQUOTED_STRING;
|
||
|
}
|
||
|
break;
|
||
|
case ST_TAG_QUOTED_STRING:
|
||
|
/*
|
||
|
* Tag attribute's string value in quotes.
|
||
|
*/
|
||
|
if(C == CQUOTE) {
|
||
|
/* Return back to the tag state */
|
||
|
state = ST_TAG_BODY;
|
||
|
}
|
||
|
break;
|
||
|
case ST_TAG_UNQUOTED_STRING:
|
||
|
if(C == RANGLE) {
|
||
|
/* End of the tag */
|
||
|
TOKEN_CB_FINAL(PXML_TAG, ST_TEXT, 1);
|
||
|
} else if(WHITESPACE(C)) {
|
||
|
/* Return back to the tag state */
|
||
|
state = ST_TAG_BODY;
|
||
|
}
|
||
|
break;
|
||
|
case ST_COMMENT_WAIT_DASH1:
|
||
|
if(C == CDASH) {
|
||
|
state = ST_COMMENT_WAIT_DASH2;
|
||
|
} else {
|
||
|
/* Some ordinary tag. */
|
||
|
state = ST_TAG_BODY;
|
||
|
}
|
||
|
break;
|
||
|
case ST_COMMENT_WAIT_DASH2:
|
||
|
if(C == CDASH) {
|
||
|
/* Seen "<--" */
|
||
|
state = ST_COMMENT;
|
||
|
} else {
|
||
|
/* Some ordinary tag */
|
||
|
state = ST_TAG_BODY;
|
||
|
}
|
||
|
break;
|
||
|
case ST_COMMENT:
|
||
|
if(C == CDASH) {
|
||
|
state = ST_COMMENT_CLO_DASH2;
|
||
|
}
|
||
|
break;
|
||
|
case ST_COMMENT_CLO_DASH2:
|
||
|
if(C == CDASH) {
|
||
|
state = ST_COMMENT_CLO_RT;
|
||
|
} else {
|
||
|
/* This is not an end of a comment */
|
||
|
state = ST_COMMENT;
|
||
|
}
|
||
|
break;
|
||
|
case ST_COMMENT_CLO_RT:
|
||
|
if(C == RANGLE) {
|
||
|
TOKEN_CB_FINAL(PXML_COMMENT, ST_TEXT, 1);
|
||
|
} else if(C == CDASH) {
|
||
|
/* Maintain current state, still waiting for '>' */
|
||
|
} else {
|
||
|
state = ST_COMMENT;
|
||
|
}
|
||
|
break;
|
||
|
} /* switch(*ptr) */
|
||
|
} /* for() */
|
||
|
|
||
|
/*
|
||
|
* Flush the partially processed chunk, state permitting.
|
||
|
*/
|
||
|
if(p - chunk_start) {
|
||
|
switch (state) {
|
||
|
case ST_COMMENT:
|
||
|
TOKEN_CB(PXML_COMMENT, state, 0);
|
||
|
break;
|
||
|
case ST_TEXT:
|
||
|
TOKEN_CB(PXML_TEXT, state, 0);
|
||
|
break;
|
||
|
default: break; /* a no-op */
|
||
|
}
|
||
|
}
|
||
|
|
||
|
finish:
|
||
|
*stateContext = (int)state;
|
||
|
return chunk_start - (const char *)xmlbuf;
|
||
|
}
|
||
|
|