/* libCStreamedXML/src/libCStreamedXML/300_parser.c
 *
 *  (c)2006-2009, Laurence Withers. Released under the GNU GPL. See file
 *  COPYING for more information / terms of license.
*/

//#define DEBUG_STATE_MACHINE

#ifdef DEBUG_STATE_MACHINE
#include <stdio.h>
#endif

#include <sys/types.h>
#include <stdlib.h>
#include <string.h>



/* States of the character-at-a-time parser driven by csxml_feedChar().
 * The quoted text in each comment is the input that has been consumed when
 * the parser sits in that state. Enumerator order is significant: the
 * numeric values follow from it.
 */
enum csxml_State {
    /* Between markup, at element or stream level; only whitespace seen. */
    StateNone = 0,

    /* Stream restart marker. */
    StateRestartMarker,     /* "<![RESTART["  -- collecting restart data */
    StateRestartMarker1,    /* ... first "]" */
    StateRestartMarker2,    /* ... second "]" */

    /* Start of any markup. */
    StateOpen,              /* "<" */
    StateOpenBang,          /* "<!" */
    StateOpenBangMarker,    /* "<![" */
    StateOpenCdataMarker,   /* "<![C"  -- matching rest of CDATA marker */
    StateOpenRestartMarker, /* "<![R"  -- matching rest of restart marker */

    /* CDATA sections. */
    StateCDATA,             /* inside the section, collecting text */
    StateCDATA1,            /* first "]" */
    StateCDATA2,            /* second "]" */

    /* Character content: like StateNone, but whitespace is not skipped. */
    StateData,

    /* Comments. */
    StateOpenComment,       /* "<!-" */
    StateComment,           /* inside the comment, collecting text */
    StateComment2,          /* first "-" inside the comment */
    StateComment3,          /* "--" inside the comment; ">" must follow */

    /* Processing instructions. */
    StatePI,                /* "<?"  -- currently parsing the target */
    StatePIData,            /* target finished, collecting data */
    StatePI2,               /* "?" encountered while in the target */
    StatePI3,               /* "?" encountered while in the data */

    /* Close tags. */
    StateClose,             /* "</" */
    StateClosing,           /* "</x" */
    StateNeedClose2,        /* "</xyz "  -- only whitespace or ">" allowed */

    /* Open tags and attributes. */
    StateElemName,          /* "<x" */
    StateElemTag,           /* first space after "<name" encountered */
    StateElemAttrName,      /* inside an attribute name */
    StateElemAttrEq,        /* "name=" */
    StateElemAttrVal,       /* "name='" or "name=\"" */
    StateElemAttrDone,      /* "name='value'" */
    StateNeedClose,         /* "<elem/" */

    /* Entity and character references. */
    StateEntity,            /* "&" */
    StateCharEntity,        /* "&#" */
    StateEntityName,        /* "&x" */

    /* An error was reported; input is ignored from here on. */
    StateError
};

/* Literal marker strings matched one character at a time by the parser.
 * Both share the prefix "<![" and first differ at index 3 ('C' versus 'R').
 */
static const char xmlCdataMarker[] = "<![CDATA[";
static const char xmlRestartMarker[] = "<![RESTART[";



/* Coarse classification of a single input character, used by the parser to
 * pick a branch before looking at the exact character value.
 */
enum TokenClass {
    ClassInvalid = 0,   /* character the parser rejects outright */
    ClassWhitespace,    /* tab, newline, carriage return, space */
    ClassEntity,        /* '&' */
    ClassOther,         /* any other accepted character */
    ClassOpenTag,       /* '<' */
    ClassNameChar,      /* may continue a name but not start one */
    ClassNameStartChar  /* may start (and continue) a name */
};

/* Character class lookup table, indexed by 7-bit character code (0x00--0x7F,
 * 128 entries, four per row). csxml_feedChar() only indexes this table after
 * checking that no bit above 0x7F is set; anything else is treated as
 * ClassInvalid without a lookup.
 *
 * Summary of the classes assigned below:
 *   - tab, newline, carriage return and space are whitespace;
 *   - all other control characters, and 0x7F, are invalid;
 *   - '&' is ClassEntity and '<' is ClassOpenTag;
 *   - letters, ':' and '_' may start a name;
 *   - digits, '-' and '.' may only continue a name;
 *   - everything else is ClassOther.
 */
static const enum TokenClass token_classes[] = {
    ClassInvalid, ClassInvalid, ClassInvalid, ClassInvalid, // control chars 00--03
    ClassInvalid, ClassInvalid, ClassInvalid, ClassInvalid, // control chars 04--07
    ClassInvalid, ClassWhitespace, ClassWhitespace, ClassInvalid, // control 08, tab, newline, control 0B
    ClassInvalid, ClassWhitespace, ClassInvalid, ClassInvalid, // control 0C, carriage return, control 0E--0F
    ClassInvalid, ClassInvalid, ClassInvalid, ClassInvalid, // control chars 10--13
    ClassInvalid, ClassInvalid, ClassInvalid, ClassInvalid, // control chars 14--17
    ClassInvalid, ClassInvalid, ClassInvalid, ClassInvalid, // control chars 18--1B
    ClassInvalid, ClassInvalid, ClassInvalid, ClassInvalid, // control chars 1C--1F
    ClassWhitespace, ClassOther, ClassOther, ClassOther, // ' ', '!', '"', '#'
    ClassOther, ClassOther, ClassEntity, ClassOther, // '$', '%', '&', '\''
    ClassOther, ClassOther, ClassOther, ClassOther, // '(', ')', '*', '+'
    ClassOther, ClassNameChar, ClassNameChar, ClassOther, // ',', '-', '.', '/'
    ClassNameChar, ClassNameChar, ClassNameChar, ClassNameChar, // '0'--'3'
    ClassNameChar, ClassNameChar, ClassNameChar, ClassNameChar, // '4'--'7'
    ClassNameChar, ClassNameChar, ClassNameStartChar, ClassOther, // '8', '9', ':', ';'
    ClassOpenTag, ClassOther, ClassOther, ClassOther, // '<', '=', '>', '?'
    ClassOther, ClassNameStartChar, ClassNameStartChar, ClassNameStartChar, // '@', 'A'--'C'
    ClassNameStartChar, ClassNameStartChar, ClassNameStartChar, ClassNameStartChar, // 'D'--'G'
    ClassNameStartChar, ClassNameStartChar, ClassNameStartChar, ClassNameStartChar, // 'H'--'K'
    ClassNameStartChar, ClassNameStartChar, ClassNameStartChar, ClassNameStartChar, // 'L'--'O'
    ClassNameStartChar, ClassNameStartChar, ClassNameStartChar, ClassNameStartChar, // 'P'--'S'
    ClassNameStartChar, ClassNameStartChar, ClassNameStartChar, ClassNameStartChar, // 'T'--'W'
    ClassNameStartChar, ClassNameStartChar, ClassNameStartChar, ClassOther, // 'X'--'Z', '['
    ClassOther, ClassOther, ClassOther, ClassNameStartChar, // '\\', ']', '^', '_'
    ClassOther, ClassNameStartChar, ClassNameStartChar, ClassNameStartChar, // '`', 'a'--'c'
    ClassNameStartChar, ClassNameStartChar, ClassNameStartChar, ClassNameStartChar, // 'd'--'g'
    ClassNameStartChar, ClassNameStartChar, ClassNameStartChar, ClassNameStartChar, // 'h'--'k'
    ClassNameStartChar, ClassNameStartChar, ClassNameStartChar, ClassNameStartChar, // 'l'--'o'
    ClassNameStartChar, ClassNameStartChar, ClassNameStartChar, ClassNameStartChar, // 'p'--'s'
    ClassNameStartChar, ClassNameStartChar, ClassNameStartChar, ClassNameStartChar, // 't'--'w'
    ClassNameStartChar, ClassNameStartChar, ClassNameStartChar, ClassOther, // 'x'--'z', '{'
    ClassOther, ClassOther, ClassOther, ClassInvalid // '|', '}', '~', control 7F
};



/* csxml_feedChar()
 *
 *  Feeds one character into the parser state machine, invoking the
 *  callbacks stored in `ctx` as complete tokens are recognised.
 *
 *  ctx: parser context holding the state, scratch buffers and callbacks.
 *  ch:  next input character. Only 7-bit characters are accepted; anything
 *       with a higher bit set, and most control characters, are reported as
 *       not well formed.
 *
 *  Returns 0 if the character was consumed (or ignored because the parser is
 *  in the error state), -1 if the input is not well formed or a buffer could
 *  not be grown, and otherwise the non-zero value returned by a failing
 *  callback or helper. After any failure reported through ERROR() or TRY()
 *  the parser is in StateError and ignores input until a restart marker
 *  ("<![RESTART[") is seen.
 *
 *  The helper macros below are defined inside this function but deliberately
 *  stay defined past its end: csxml_reset() further down uses CLEAR_BUFFER.
 */
int csxml_feedChar(struct csxml* ctx, char ch)
{
    /* Length of xmlRestartMarker ("<![RESTART["), excluding the NUL. */
    enum { RestartMarkerLen = 11 };

    enum TokenClass c;
    int try;

#ifdef DEBUG_STATE_MACHINE
    printf("%p: Character '%c' (%02X), state %d, buffer `%s', restartCount %d\n", (void*)ctx, ch, (unsigned)(unsigned char)ch, ctx->state, ctx->buffer.data, ctx->restartCount);
#endif

/* Report a well-formedness error, latch the error state and bail out. */
#define ERROR(_reason) do { \
    ctx->state = StateError; \
    ctx->notWellFormed(ctx, _reason); \
    return -1; \
}while(0)

/* Append one character to a buffer, growing it when full and keeping it
 * NUL-terminated. Returns -1 from the enclosing function if growing fails. */
#define APPEND_CH(_whichbuf, _ch) do { \
    ctx-> _whichbuf .data[ctx-> _whichbuf .len++] = _ch; \
    if((ctx-> _whichbuf .len == ctx-> _whichbuf .size) && \
        do_realloc(ctx, &ctx-> _whichbuf)) return -1; \
    ctx-> _whichbuf .data[ctx-> _whichbuf .len] = 0; \
}while(0)

/* Truncate a buffer to the empty string (storage is kept). */
#define CLEAR_BUFFER(_whichbuf) do { \
    ctx-> _whichbuf .data[0] = 0; \
    ctx-> _whichbuf .len = 0; \
}while(0)

/* Evaluate a callback/helper; on non-zero result latch the error state and
 * propagate that result to the caller. */
#define TRY(_x) do { \
    try = (_x); \
    if(try) { \
        ctx->state = StateError; \
        return try; \
    } \
}while(0)

    /* The restart marker is tracked independently of the state machine so
     * that it is recognised in every state, including StateError. */
    if(ch == xmlRestartMarker[ctx->restartCount]) {
        if(++ctx->restartCount == RestartMarkerLen) {
            csxml_reset(ctx);
            ctx->state = StateRestartMarker;
            return 0;
        }
    } else {
        /* '<' occurs only at the start of the marker, so a mismatching '<'
         * may itself begin a new marker (e.g. "<<![RESTART["); any other
         * mismatch starts over from scratch. */
        ctx->restartCount = (ch == xmlRestartMarker[0]) ? 1 : 0;
    }

    if(ch & ~0x7F) c = ClassInvalid;
    else c = token_classes[(int)ch];

    /* End-of-line normalisation: CR and CR LF are both delivered as a single
     * '\n'; a lone LF is delivered as-is. */
    if(ch == '\r') {
        ctx->skipNextNewline = 1;
        ch = '\n';
        ++ctx->line;
        ctx->col = 0;
    } else if(ch == '\n') {
        if(ctx->skipNextNewline) {
            /* LF completing a CR LF pair: swallow this one only. The flag
             * must be cleared, otherwise every following LF in a run would
             * be swallowed as well. */
            ctx->skipNextNewline = 0;
            return 0;
        }
        ++ctx->line;
        ctx->col = 0;
    } else {
        ctx->skipNextNewline = 0;
    }

    if(c == ClassInvalid) ERROR("Restricted character encountered.");

    // deal with char appropriately, according to state
    switch(ctx->state) {
    case StateError:
        return 0;

    case StateNone:
        /* Whitespace is collected in `buffer` and reported through the
         * whiteSpace callback once something else turns up. */
        switch(c) {
        case ClassWhitespace:
            APPEND_CH(buffer, ch);
            break;

        case ClassOpenTag:
            ctx->state = StateOpen;
            if(ctx->buffer.len) TRY(ctx->whiteSpace(ctx, ctx->buffer.data));
            CLEAR_BUFFER(buffer);
            break;

        case ClassEntity:
            if(ctx->expandEntities) {
                if(!ctx->elementDepth) ERROR("Entities cannot appear at stream level.");
                if(ctx->buffer.len) TRY(ctx->whiteSpace(ctx, ctx->buffer.data));
                CLEAR_BUFFER(buffer);
                ctx->parsingAttr = 0;
                ctx->state = StateEntity;
                break;
            }

            // fall through
        default:
            if(!ctx->elementDepth) ERROR("Content cannot appear at stream level.");
            ctx->state = StateData;
            if(ctx->buffer.len) TRY(ctx->whiteSpace(ctx, ctx->buffer.data));
            CLEAR_BUFFER(buffer);
            APPEND_CH(buffer, ch);
            break;
        }
        break;

    case StateData:
        switch(c) {
        case ClassOpenTag:
            ctx->state = StateOpen;
            TRY(ctx->content(ctx, ctx->buffer.data));
            CLEAR_BUFFER(buffer);
            break;

        case ClassEntity:
            if(ctx->expandEntities) {
                ctx->parsingAttr = 0;
                ctx->state = StateEntity;
                break;
            }

            /* fall through */
        default:
            APPEND_CH(buffer, ch);
            break;
        }
        break;

    case StateCDATA:
        if(ch == ']') ctx->state = StateCDATA1;
        else APPEND_CH(buffer, ch);
        break;

    case StateCDATA1:
        if(ch == ']') ctx->state = StateCDATA2;
        else {
            /* the held-back ']' was ordinary text after all */
            APPEND_CH(buffer, ']');
            APPEND_CH(buffer, ch);
            ctx->state = StateCDATA;
        }
        break;

    case StateCDATA2:
        if(ch == '>') {
            ctx->state = StateNone;
            TRY(ctx->cdata(ctx, ctx->buffer.data));
            CLEAR_BUFFER(buffer);
        } else if(ch == ']') {
            /* "]]]": the oldest ']' is text, two are still held back */
            APPEND_CH(buffer, ']');
        } else {
            APPEND_CH(buffer, ']');
            APPEND_CH(buffer, ']');
            APPEND_CH(buffer, ch);
            ctx->state = StateCDATA;
        }
        break;

    case StateRestartMarker:
        if(ch == ']') ctx->state = StateRestartMarker1;
        else APPEND_CH(buffer, ch);
        break;

    case StateRestartMarker1:
        if(ch == ']') ctx->state = StateRestartMarker2;
        else {
            APPEND_CH(buffer, ']');
            APPEND_CH(buffer, ch);
            ctx->state = StateRestartMarker;
        }
        break;

    case StateRestartMarker2:
        if(ch == '>') {
            TRY(ctx->streamRestart(ctx, ctx->buffer.data));
            csxml_reset(ctx);

        } else if(ch == ']') {
            APPEND_CH(buffer, ']');

        } else {
            APPEND_CH(buffer, ']');
            APPEND_CH(buffer, ']');
            APPEND_CH(buffer, ch);
            ctx->state = StateRestartMarker;
        }
        break;

    case StateOpen:
        switch(c) {
        case ClassNameStartChar:
            /* start of an open tag: forget the previous tag's attributes */
            ctx->state = StateElemName;
            ctx->elemAttrNames.len = 0;
            ctx->elemAttrVals.len = 0;
            CLEAR_BUFFER(elemName);
            APPEND_CH(elemName, ch);
            break;

        default:
            if(ch == '!') ctx->state = StateOpenBang;
            else if(ch == '?') {
                ctx->state = StatePI;
                CLEAR_BUFFER(buffer2);
            } else if(ch == '/') {
                if(!ctx->elementDepth) ERROR("Encountered a close tag at stream level.");
                ctx->state = StateClose;
            } else ERROR("Invalid start character for element.");
            break;
        }
        break;

    case StatePI:
        /* PI target accumulates in buffer2, PI data in buffer */
        if(ch == '?') ctx->state = StatePI2;
        else if(c == ClassWhitespace) {
            ctx->state = StatePIData;
            CLEAR_BUFFER(buffer);
        } else APPEND_CH(buffer2, ch);
        break;

    case StatePI2:
        if(ch != '>') ERROR("Invalid target for PI");
        else {
            ctx->state = StateNone;
            TRY(ctx->PI(ctx, ctx->buffer2.data, 0));
        }
        break;

    case StatePIData:
        if(ch == '?') ctx->state = StatePI3;
        else APPEND_CH(buffer, ch);
        break;

    case StatePI3:
        if(ch == '>') {
            ctx->state = StateNone;
            TRY(ctx->PI(ctx, ctx->buffer2.data, ctx->buffer.data));
            CLEAR_BUFFER(buffer);
        } else if(ch == '?') {
            APPEND_CH(buffer, '?');
        } else {
            APPEND_CH(buffer, '?');
            APPEND_CH(buffer, ch);
            ctx->state = StatePIData;
        }
        break;

    case StateOpenBang:
        if(ch == '[') {
            ctx->state = StateOpenBangMarker;
            ctx->xmlCount = 3; /* "<![" already matched */
        } else if(ch == '-') ctx->state = StateOpenComment;
        else ERROR("Invalid special tag.");
        break;

    case StateOpenBangMarker:
        /* index 3 is where the CDATA and restart markers first differ */
        if(ch == xmlCdataMarker[ctx->xmlCount]) {
            ctx->state = StateOpenCdataMarker;
            if(!ctx->elementDepth) ERROR("CDATA sections not valid at stream level.");
        } else if(ch == xmlRestartMarker[ctx->xmlCount]) {
            ctx->state = StateOpenRestartMarker;
        } else {
            ERROR("Invalid marked section.");
        }
        ++(ctx->xmlCount);
        break;

    case StateOpenCdataMarker:
        if(ch != xmlCdataMarker[ctx->xmlCount]) ERROR("Invalid marked section.");
        if(!xmlCdataMarker[++ctx->xmlCount]) ctx->state = StateCDATA;
        break;

    case StateOpenRestartMarker:
        if(ch != xmlRestartMarker[ctx->xmlCount++]) ERROR("Invalid marked section.");
        /* Restart markers handled by lower layer -- the restartCount check
         * at the top of this function fires on the marker's last character,
         * before this case could run off the end of the string. */
        break;

    case StateOpenComment:
        if(ch != '-') ERROR("Invalid special tag.");
        ctx->state = StateComment;
        CLEAR_BUFFER(buffer);
        break;

    case StateComment:
        if(ch == '-') ctx->state = StateComment2;
        else APPEND_CH(buffer, ch);
        break;

    case StateComment2:
        if(ch == '-') ctx->state = StateComment3;
        else {
            APPEND_CH(buffer, '-');
            APPEND_CH(buffer, ch);
            ctx->state = StateComment;
        }
        break;

    case StateComment3:
        if(ch != '>') ERROR("`--' not valid in comments");
        ctx->state = StateNone;
        TRY(ctx->comment(ctx, ctx->buffer.data));
        CLEAR_BUFFER(buffer);
        break;

    case StateElemName:
        switch(c) {
        case ClassWhitespace:
            ctx->state = StateElemTag;
            CLEAR_BUFFER(buffer);
            break;

        case ClassNameStartChar:
        case ClassNameChar:
            APPEND_CH(elemName, ch);
            break;

        default:
            switch(ch) {
            case '>':
                CLEAR_BUFFER(buffer);
                TRY(list_push(ctx, &ctx->elemStack, &ctx->elemName));
                ctx->state = StateNone;
                TRY(ctx->element(ctx, ctx->elemName.data, 0));
                ++ctx->elementDepth;
                break;

            case '/':
                ctx->state = StateNeedClose;
                break;

            default:
                ERROR("Invalid character in tag name.");
            }
        }
        break;

    case StateElemTag:
        switch(c) {
        case ClassWhitespace:
            break;

        case ClassNameStartChar:
            ctx->state = StateElemAttrName;
            CLEAR_BUFFER(buffer);
            APPEND_CH(buffer, ch);
            break;

        default:
            switch(ch) {
            case '>':
                /* `buffer` may still hold the last attribute's value (we can
                 * arrive here via StateElemAttrDone + whitespace); clear it
                 * so StateNone does not report it as whitespace. */
                CLEAR_BUFFER(buffer);
                TRY(list_push(ctx, &ctx->elemStack, &ctx->elemName));
                ctx->state = StateNone;
                TRY(ctx->element(ctx, ctx->elemName.data, ctx->elemAttrNames.len));
                ++ctx->elementDepth;
                break;

            case '/':
                ctx->state = StateNeedClose;
                break;

            default:
                ERROR("Invalid character in tag.");
            }
        }
        break;

    case StateElemAttrName:
        switch(c) {
        case ClassNameStartChar:
        case ClassNameChar:
            APPEND_CH(buffer, ch);
            break;

        default:
            if(ch != '=') ERROR("Invalid character in attribute name.");
            /* `try` doubles as the loop index here */
            for(try = 0; (size_t)try < ctx->elemAttrNames.len; ++try) {
                if(!strcmp(ctx->elemAttrNames.data[try].data, ctx->buffer.data))
                    ERROR("Duplicate attribute in element.");
            }
            TRY(list_push(ctx, &ctx->elemAttrNames, &ctx->buffer));
            CLEAR_BUFFER(buffer);
            ctx->state = StateElemAttrEq;
            break;
        }
        break;

    case StateElemAttrEq:
        /* remember which quote opened the value so only it can close it */
        if(ch == '\'') ctx->singleQuote = 1;
        else if(ch == '"') ctx->singleQuote = 0;
        else ERROR("Invalid character in attribute.");
        ctx->state = StateElemAttrVal;
        break;

    case StateElemAttrVal:
        if((ctx->singleQuote && ch == '\'') || (!ctx->singleQuote && ch == '"')) {
            TRY(list_push(ctx, &ctx->elemAttrVals, &ctx->buffer));
            ctx->state = StateElemAttrDone;
        } else if(ctx->expandEntities && ch == '&') {
            ctx->parsingAttr = 1;
            ctx->state = StateEntity;
        } else APPEND_CH(buffer, ch);
        break;

    case StateElemAttrDone:
        switch(c) {
        case ClassWhitespace:
            ctx->state = StateElemTag;
            break;

        default:
            if(ch == '/') {
                ctx->state = StateNeedClose;
            } else if(ch == '>') {
                ctx->state = StateNone;
                CLEAR_BUFFER(buffer);
                TRY(ctx->element(ctx, ctx->elemName.data, ctx->elemAttrVals.len));
                TRY(list_push(ctx, &ctx->elemStack, &ctx->elemName));
                ++ctx->elementDepth;
            } else ERROR("Invalid character after attribute.");
            break;
        }
        break;

    case StateNeedClose:
        /* empty-element tag: report open and close back to back; the
         * element stack and depth are left untouched */
        if(ch != '>') ERROR("Stray `/' in open tag.");
        ctx->state = StateNone;
        CLEAR_BUFFER(buffer);
        TRY(ctx->element(ctx, ctx->elemName.data, ctx->elemAttrVals.len));
        TRY(ctx->closeTag(ctx, ctx->elemName.data));
        break;

    case StateClose:
        if(c != ClassNameStartChar) ERROR("Invalid character in close tag name.");
        APPEND_CH(buffer, ch);
        ctx->state = StateClosing;
        break;

    case StateClosing:
        switch(c) {
        case ClassNameStartChar:
        case ClassNameChar:
            APPEND_CH(buffer, ch);
            break;

        case ClassWhitespace:
            ctx->state = StateNeedClose2;
            break;

        default:
            if(ch != '>') ERROR("Invalid character in close tag name.");
            /* the close tag must name the innermost open element */
            TRY(buffer_copy(ctx, &ctx->elemName, list_pop(&ctx->elemStack)));
            if(strcmp(ctx->elemName.data, ctx->buffer.data)) ERROR("Mismatched close tag.");
            ctx->state = StateNone;
            CLEAR_BUFFER(buffer);
            TRY(ctx->closeTag(ctx, ctx->elemName.data));
            --ctx->elementDepth;
        }
        break;

    case StateNeedClose2:
        if(c == ClassWhitespace) break;
        if(ch != '>') ERROR("Invalid data in close tag.");
        TRY(buffer_copy(ctx, &ctx->elemName, list_pop(&ctx->elemStack)));
        if(strcmp(ctx->elemName.data, ctx->buffer.data)) ERROR("Mismatched close tag.");
        ctx->state = StateNone;
        CLEAR_BUFFER(buffer);
        TRY(ctx->closeTag(ctx, ctx->elemName.data));
        --ctx->elementDepth;
        break;

    case StateEntity:
        if(ch == '#') {
            ctx->state = StateCharEntity;
            ctx->entityChar = 0;
        } else if(c == ClassNameStartChar) {
            CLEAR_BUFFER(buffer2);
            APPEND_CH(buffer2, ch);
            ctx->state = StateEntityName;
        } else ERROR("Invalid entity name.");
        break;

    case StateCharEntity:
        if(ch == ';') {
            /* Zero here means either "&#;" (no digits at all) or a code of
             * 0; both would otherwise plant a NUL in the middle of the
             * buffer. Checking at the terminator rather than per digit also
             * lets leading zeros ("&#065;") through, as XML permits. */
            if(!ctx->entityChar) ERROR("Character code out of range in character entity.");
            APPEND_CH(buffer, ctx->entityChar);
            ctx->state = ctx->parsingAttr ? StateElemAttrVal : StateData;
            break;

        } else if(ch >= '0' && ch <= '9') {
            /* Accumulate in an int and range-check before storing, so the
             * check cannot be defeated by truncation if entityChar is a
             * narrow type. */
            const int code = ctx->entityChar * 10 + (ch - '0');
            if(code > 126) ERROR("Character code out of range in character entity.");
            ctx->entityChar = code;
        } else ERROR("Invalid character in character entity.");
        break;

    case StateEntityName:
        if(ch == ';') {
            /* expansion is appended to whatever content/attribute text has
             * been collected so far */
            const char* e = csxml_entityRef(ctx, ctx->buffer2.data);
            TRY(!e || buffer_strcat(ctx, &ctx->buffer, e));
            ctx->state = ctx->parsingAttr ? StateElemAttrVal : StateData;
            break;
        }
        if(c != ClassNameChar && c != ClassNameStartChar) ERROR("Invalid entity name.");
        APPEND_CH(buffer2, ch);
        break;
    }
    ++ctx->col;
    return 0;
}



/* csxml_entityRef()
 *
 *  Resolves the entity name `ent` to its replacement text. The five
 *  predefined entities (quot, amp, apos, lt, gt) are answered directly;
 *  every other name is passed to the context's entityRef callback.
 *
 *  Returns the replacement text, or 0 if the callback does not know the
 *  entity. In the latter case the parser is put into StateError and the
 *  unknownEntity callback is invoked with the offending name.
 */
const char* csxml_entityRef(struct csxml* ctx, const char* ent)
{
    static const struct {
        const char* name;
        const char* text;
    } predefined[] = {
        { "quot", "\"" },
        { "amp",  "&"  },
        { "apos", "'"  },
        { "lt",   "<"  },
        { "gt",   ">"  }
    };
    const char* expansion;
    size_t i;

    for(i = 0; i < sizeof(predefined) / sizeof(predefined[0]); ++i) {
        if(strcmp(ent, predefined[i].name) == 0) return predefined[i].text;
    }

    /* not built in: ask the user-supplied resolver */
    expansion = ctx->entityRef(ctx, ent);
    if(expansion) return expansion;

    ctx->state = StateError;
    ctx->unknownEntity(ctx, ent);
    return 0;
}



/* csxml_reset()
 *
 *  Returns the parser to its initial state: scratch buffers emptied (their
 *  storage is kept), element stack and attribute lists emptied, and all
 *  state-machine bookkeeping zeroed. Line/column counters and the installed
 *  callbacks are not touched.
 */
void csxml_reset(struct csxml* ctx)
{
    /* scratch buffers become empty strings */
    ctx->buffer.len = 0;
    ctx->buffer.data[0] = 0;
    ctx->buffer2.len = 0;
    ctx->buffer2.data[0] = 0;
    ctx->elemName.len = 0;
    ctx->elemName.data[0] = 0;

    /* element stack and per-tag attribute lists */
    ctx->elemStack.len = 0;
    ctx->elemAttrNames.len = 0;
    ctx->elemAttrVals.len = 0;
    ctx->elementDepth = 0;

    /* state machine bookkeeping */
    ctx->state = StateNone;
    ctx->xmlCount = 0;
    ctx->restartCount = 0;
    ctx->skipNextNewline = 0;
    ctx->parsingAttr = 0;
}



/* Retire the helper macros defined inside csxml_feedChar() now that their
 * last user, csxml_reset(), is behind us. Note that TRY is not undefined
 * here and so remains visible to the rest of the translation unit.
 */
#undef ERROR
#undef APPEND_CH
#undef CLEAR_BUFFER



/* csxml_freeParser()
 *
 *  Releases a parser created by csxml_newParser(): its three scratch
 *  buffers, its three lists, and finally the context itself. Passing a null
 *  pointer is allowed and does nothing.
 */
void csxml_freeParser(struct csxml* ctx)
{
    if(ctx) {
        /* lists */
        list_free(&ctx->elemAttrVals);
        list_free(&ctx->elemAttrNames);
        list_free(&ctx->elemStack);

        /* scratch buffers */
        buffer_free(&ctx->elemName);
        buffer_free(&ctx->buffer2);
        buffer_free(&ctx->buffer);

        free(ctx);
    }
}



/* csxml_newParser()
 *
 *  Allocates a zero-filled parser context, initialises its three scratch
 *  buffers, enables entity expansion and installs the default callbacks.
 *
 *  Returns the new context, or 0 if the allocation or any buffer
 *  initialisation fails (anything already set up is released again through
 *  csxml_freeParser()).
 */
struct csxml* csxml_newParser()
{
    struct csxml* parser = calloc(1, sizeof *parser);

    if(!parser) return 0;

    if(buffer_init(parser, &parser->buffer)
        || buffer_init(parser, &parser->buffer2)
        || buffer_init(parser, &parser->elemName))
    {
        csxml_freeParser(parser);
        return 0;
    }

    parser->expandEntities = 1;

    /* error reporting hooks */
    parser->notWellFormed = default_notWellFormed;
    parser->outOfMemory = default_outOfMemory;
    parser->unknownEntity = default_unknownEntity;

    /* token hooks */
    parser->whiteSpace = default_discard;
    parser->content = default_discard;
    parser->cdata = default_cdata;
    parser->streamRestart = default_discard;
    parser->PI = default_discardPI;
    parser->comment = default_discard;
    parser->element = default_discardElem;
    parser->closeTag = default_discard;
    parser->entityRef = default_discardEnt;

    return parser;
}



/* csxml_feedData()
 *
 *  Convenience wrapper: feeds `amt` characters starting at `data` to
 *  csxml_feedChar(), one at a time and in order. The per-character result
 *  is discarded, so feeding continues even after a character is rejected.
 */
void csxml_feedData(struct csxml* ctx, const char* data, size_t amt)
{
    size_t i;

    for(i = 0; i < amt; ++i) {
        (void)csxml_feedChar(ctx, data[i]);
    }
}



/* options for text editors
kate: replace-trailing-space-save true; space-indent true; tab-width 4;
vim: expandtab:ts=4:sw=4
*/
