propane/assets/parser.d.erb

/**
 * @file
 *
 * This file is generated by Propane.
 */

<% if @grammar.modulename %>
module <%= @grammar.modulename %>;

<% end %>
import std.stdio;

/**************************************************************************
 * User code blocks
 *************************************************************************/

<% @grammar.code_blocks.each do |code| %>
<%= code %>
<% end %>

/**************************************************************************
 * Public types
 *************************************************************************/

/* Result codes returned by the lexer and parser functions in this module. */
public enum : size_t
{
    /* The operation completed successfully. */
    P_SUCCESS,
    /* The decoder encountered invalid text encoding. */
    P_DECODE_ERROR,
    /* Input text does not match any lexer pattern. */
    P_UNEXPECTED_INPUT,
    /* A token was encountered that does not match any grammar rule. */
    P_UNEXPECTED_TOKEN,
    /* A drop pattern was matched so the lexer should continue; p_lex()
     * retries on this code rather than returning it. */
    P_DROP,
    /* The end of the text input was reached. */
    P_EOF,
}

/**
 * Token ID type.
 *
 * The underlying type is selected when this file is generated, from the
 * invalid token ID value (see INVALID_TOKEN_ID).
 */
public alias Token = <%= get_type_for(@grammar.invalid_token_id) %>;

/**
 * Token IDs.
 *
 * One TOKEN_<name> constant is generated per grammar token, numbered by the
 * token's index in the grammar's token list. Generation fails if a token's
 * ID differs from its index, so a token ID can be used directly as an index
 * into p_token_names.
 */
public enum : Token
{
<% @grammar.tokens.each_with_index do |token, index| %>
    TOKEN_<%= token.code_name %> = <%= index %>,
<%   unless token.id == index %>
<%     raise "Token ID (#{token.id}) does not match index (#{index}) for token #{token.name}!" %>
<%   end %>
<% end %>
    /* Sentinel value meaning "no token". */
    INVALID_TOKEN_ID = <%= @grammar.invalid_token_id %>,
}

/** Code point type, as produced by p_decode_code_point(). */
public alias CodePoint = uint;

/**
 * Parser values type(s).
 *
 * One member named v_<name> is generated for each parser value type declared
 * by the grammar. All members share the same storage.
 */
public static union ParserValue
{
<% @grammar.ptypes.each do |name, typestring| %>
    <%= typestring %> v_<%= name %>;
<% end %>
}

/**
 * A structure to keep track of parser position.
 *
 * This is useful for reporting errors, etc...
 *
 * The lexer advances the row at each '\n' code point (resetting the column
 * to 0) and otherwise advances the column once per decoded code point, so
 * columns count code points rather than bytes.
 */
public static struct Position
{
    /** Input text row (0-based). */
    uint row;

    /** Input text column (0-based). */
    uint col;
}

/**
 * Lexed token information.
 *
 * Filled in by p_lex() when it returns P_SUCCESS.
 */
public static struct TokenInfo
{
    /** Text position where the token was found (the lexer's input position
     * before the token's text was consumed). */
    Position position;

    /** Number of input bytes used by the token. */
    size_t length;

    /** Token identifier. */
    Token token;

    /** Parser value associated with the token. */
    ParserValue pvalue;
}

/**
 * Lexer and parser context.
 *
 * The user must allocate an instance of this structure and pass it to any
 * public API function. Use p_context_init() to initialize it before lexing
 * or parsing.
 */
public struct p_context_t
{
    /* Lexer context data. */

    /** Input text. */
    string input;

    /** Input text index (byte offset) of the next unconsumed input. */
    size_t input_index;

    /** Input text position (row/column). */
    Position input_position;

    /** Current lexer mode (index into the lexer mode table). */
    size_t mode;

    /* Parser context data. */

    /** Parse result value; stored by p_parse() on success and read back
     * with p_result(). */
    ParserValue parse_result;
}

/**************************************************************************
 * Public data
 *************************************************************************/

/** Token names, indexed by token ID. */
public static immutable string[] p_token_names = [
<% @grammar.tokens.each_with_index do |token, index| %>
    "<%= token.name %>",
<% end %>
];

/**************************************************************************
 * Private types
 *************************************************************************/

/* An invalid ID value (all bits set). check_shift() and check_reduce()
 * return it when no table entry matches. */
private enum size_t INVALID_ID = cast(size_t)-1;

/**************************************************************************
 * State initialization
 *************************************************************************/

/**
 * Initialize lexer/parser context structure.
 *
 * Every field is reset to its default value, then the input text and the
 * default lexer mode are installed.
 *
 * @param[out] context
 *   Lexer/parser context structure.
 * @param input
 *   Text input.
 */
public void p_context_init(p_context_t * context, string input)
{
    /* Start from a default-initialized context so no stale state survives. */
    *context = p_context_t.init;

    /* Lexer initialization. */
    context.input = input;
    context.mode = <%= @lexer.mode_id("default") %>;
}

/**************************************************************************
 * Decoder
 *************************************************************************/

/**
 * Decode a UTF-8 code point.
 *
 * The leading byte selects the sequence length (1 to 6 bytes); each
 * following byte must have the form 10xxxxxx and contributes six bits.
 *
 * @param input
 *   Text input to decode.
 * @param[out] out_code_point
 *   The decoded code point is stored here if the return value is P_SUCCESS.
 * @param[out] out_code_point_length
 *   The number of bytes the code point used is stored here if the return value
 *   is P_SUCCESS.
 *
 * @retval P_SUCCESS on a successful code point decode
 * @retval P_DECODE_ERROR when an encoding error is observed
 * @retval P_EOF when the end of the text input is reached
 */
public size_t p_decode_code_point(string input,
    CodePoint * out_code_point, ubyte * out_code_point_length)
{
    if (input.length == 0u)
    {
        return P_EOF;
    }

    immutable char lead = input[0];
    CodePoint value;
    ubyte n_continuation;

    /* Classify the leading byte and extract its payload bits. */
    if ((lead & 0x80u) == 0u)
    {
        /* 0xxxxxxx: single-byte sequence. */
        value = lead;
        n_continuation = 0u;
    }
    else if ((lead & 0xE0u) == 0xC0u)
    {
        /* 110xxxxx */
        value = lead & 0x1Fu;
        n_continuation = 1u;
    }
    else if ((lead & 0xF0u) == 0xE0u)
    {
        /* 1110xxxx */
        value = lead & 0x0Fu;
        n_continuation = 2u;
    }
    else if ((lead & 0xF8u) == 0xF0u)
    {
        /* 11110xxx */
        value = lead & 0x07u;
        n_continuation = 3u;
    }
    else if ((lead & 0xFCu) == 0xF8u)
    {
        /* 111110xx */
        value = lead & 0x03u;
        n_continuation = 4u;
    }
    else if ((lead & 0xFEu) == 0xFCu)
    {
        /* 1111110x */
        value = lead & 0x01u;
        n_continuation = 5u;
    }
    else
    {
        /* Not a valid leading byte. */
        return P_DECODE_ERROR;
    }

    /* The input must hold the leading byte plus every continuation byte. */
    if (input.length <= n_continuation)
    {
        return P_DECODE_ERROR;
    }

    /* Fold in six payload bits from each continuation byte. */
    foreach (char continuation; input[1u .. 1u + n_continuation])
    {
        if ((continuation & 0xC0u) != 0x80u)
        {
            return P_DECODE_ERROR;
        }
        value = (value << 6u) | (continuation & 0x3Fu);
    }

    *out_code_point = value;
    *out_code_point_length = cast(ubyte)(n_continuation + 1u);
    return P_SUCCESS;
}

/**************************************************************************
 * Lexer
 *************************************************************************/

/* Lexer state ID type, selected at generation time from the number of
 * entries in the lexer state table. */
private alias LexerStateID = <%= get_type_for(@lexer.state_table.size) %>;
/* Sentinel meaning "no lexer state": one past the last valid state index. */
private enum LexerStateID INVALID_LEXER_STATE_ID = <%= @lexer.state_table.size %>u;
<% user_code_id_count = (@grammar.patterns.map(&:code_id).compact.max || 0) + 1 %>
/* Lexer user code block ID type. */
private alias UserCodeID = <%= get_type_for(user_code_id_count) %>;
/* Sentinel meaning "no user code": one more than the largest pattern code ID. */
private enum UserCodeID INVALID_USER_CODE_ID = <%= user_code_id_count %>u;

/* A lexer state transition taken when the input code point lies within
 * the inclusive range [first, last]. */
private struct Transition
{
    /* Lowest code point matched by this transition. */
    CodePoint first;
    /* Highest code point matched by this transition. */
    CodePoint last;
    /* Lexer state entered when the transition is taken. */
    LexerStateID destination_state;
}

/* A lexer state: its slice of the transition table plus what (if anything)
 * it accepts. */
private struct LexerState
{
    /* Index in lexer_transitions of this state's first transition. */
    <%= get_type_for(@lexer.transition_table.size - 1) %> transition_table_index;
    /* Number of consecutive transitions belonging to this state. */
    <%= get_type_for(@lexer.state_table.map {|ste| ste[:n_transitions]}.max) %> n_transitions;
    /* Token for this state, or INVALID_TOKEN_ID if none. */
    Token token;
    /* User code block for this state, or INVALID_USER_CODE_ID if none. */
    UserCodeID code_id;
    /* Whether reaching this state constitutes a pattern match. */
    bool accepts;
}

/* A lexer mode. */
private struct Mode
{
    /* Index in lexer_states of the state matching starts from in this mode. */
    uint state_table_offset;
}

/* Information about a (possibly partial) lexer pattern match. */
private struct MatchInfo
{
    /* Number of input bytes matched. */
    size_t length;
    /* Position change caused by the matched text. If row is nonzero, col is
     * the column reached on the final row; otherwise col is the number of
     * columns to add to the starting column. */
    Position delta_position;
    /* Accepting lexer state reached by the match. */
    const(LexerState) * accepting_state;
}

/* Lexer transition table. Each lexer state owns a contiguous run of entries
 * (see LexerState.transition_table_index and LexerState.n_transitions). */
private static immutable Transition[] lexer_transitions = [
<% @lexer.transition_table.each do |transition_table_entry| %>
    Transition(<%= transition_table_entry[:first] %>u,
               <%= transition_table_entry[:last] %>u,
               <%= transition_table_entry[:destination] %>u),
<% end %>
];

/* Lexer state table, indexed by lexer state ID. Entries with no token or no
 * user code block get INVALID_TOKEN_ID / INVALID_USER_CODE_ID respectively. */
private static immutable LexerState[] lexer_states = [
<% @lexer.state_table.each do |state_table_entry| %>
LexerState(<%= state_table_entry[:transition_table_index] %>u,
<%= state_table_entry[:n_transitions] %>u,
<%   if state_table_entry[:token] %>
Token(<%= state_table_entry[:token] %>u),
<%   else %>
INVALID_TOKEN_ID,
<%   end %>
<%   if state_table_entry[:code_id] %>
<%= state_table_entry[:code_id] %>u,
<%   else %>
INVALID_USER_CODE_ID,
<%   end %>
<%= state_table_entry[:accepts] %>),
<% end %>
];

/* Lexer mode table, indexed by the context's current mode. */
private static immutable Mode[] modes = [
<% @lexer.mode_table.each do |mode_table_entry| %>
    Mode(<%= mode_table_entry[:state_table_offset] %>),
<% end %>
];

/**
 * Execute user code associated with a lexer pattern.
 *
 * One switch case is generated for each grammar pattern that has a user code
 * block; an unknown code ID executes nothing.
 *
 * @param context
 *   Lexer/parser context structure.
 * @param code_id
 *   The ID of the user code block to execute.
 * @param match
 *   Matched text for this pattern.
 * @param out_token_info
 *   Lexer token info in progress.
 *
 * @return Token to accept, or invalid token if the user code does
 *   not explicitly return a token.
 */
private Token lexer_user_code(p_context_t * context, UserCodeID code_id, string match, TokenInfo * out_token_info)
{
    switch (code_id)
    {
<% @grammar.patterns.each do |pattern| %>
<%   if pattern.code_id %>
    case <%= pattern.code_id %>u: {
<%= expand_code(pattern.code, false, nil, pattern) %>
    } break;
<%   end %>
<% end %>
    default: break;
    }

    /* Reached when the user code did not return a token itself. */
    return INVALID_TOKEN_ID;
}

/**
 * Look up the lexer transition, if any, that the given input code point
 * triggers from the current lexer state.
 *
 * The state's transitions are scanned in table order and the first one whose
 * inclusive code point range contains the input wins.
 *
 * @param current_state
 *   Current lexer state.
 * @param code_point
 *   Input code point.
 *
 * @return Lexer state to transition to, or INVALID_LEXER_STATE_ID if none.
 */
private LexerStateID check_lexer_transition(uint current_state, uint code_point)
{
    const(LexerState) * state = &lexer_states[current_state];
    immutable uint base = state.transition_table_index;
    for (uint offset = 0u; offset < state.n_transitions; offset++)
    {
        const(Transition) * candidate = &lexer_transitions[base + offset];
        if ((code_point >= candidate.first) && (code_point <= candidate.last))
        {
            return candidate.destination_state;
        }
    }
    return INVALID_LEXER_STATE_ID;
}

/**
 * Find the longest lexer pattern match at the current position.
 *
 * Starting from the start state of the context's current lexer mode, code
 * points are decoded and lexer transitions followed for as long as possible,
 * remembering the most recent accepting state reached. The context itself is
 * not modified.
 *
 * @param context
 *   Lexer/parser context structure.
 * @param[out] out_match_info
 *   The longest match information is stored here if the return value is
 *   P_SUCCESS.
 * @param[out] out_unexpected_input_length
 *   The number of input bytes examined without finding a match is stored
 *   here if the return value is P_UNEXPECTED_INPUT.
 *
 * @retval P_SUCCESS
 *   A pattern was successfully matched.
 * @retval P_DECODE_ERROR
 *   The decoder encountered invalid text encoding.
 * @retval P_UNEXPECTED_INPUT
 *   Input text does not match any lexer pattern.
 * @retval P_EOF
 *   The end of the text input was reached.
 */
private size_t find_longest_match(
    p_context_t * context,
    MatchInfo * out_match_info,
    size_t * out_unexpected_input_length)
{
    /* Best accepted match so far, and the match currently being extended. */
    MatchInfo longest_match;
    MatchInfo attempt_match;
    uint current_state = modes[context.mode].state_table_offset;
    for (;;)
    {
        /* Remaining input beyond what the current attempt has consumed. */
        string input = context.input[(context.input_index + attempt_match.length)..(context.input.length)];
        CodePoint code_point;
        ubyte code_point_length;
        size_t result = p_decode_code_point(input, &code_point, &code_point_length);
        switch (result)
        {
        case P_SUCCESS:
            LexerStateID transition_state = check_lexer_transition(current_state, code_point);
            if (transition_state != INVALID_LEXER_STATE_ID)
            {
                /* Consume the code point and track the position change. */
                attempt_match.length += code_point_length;
                if (code_point == '\n')
                {
                    attempt_match.delta_position.row++;
                    attempt_match.delta_position.col = 0u;
                }
                else
                {
                    attempt_match.delta_position.col++;
                }
                current_state = transition_state;
                if (lexer_states[current_state].accepts)
                {
                    /* Record this as the longest accepted match so far. */
                    attempt_match.accepting_state = &lexer_states[current_state];
                    longest_match = attempt_match;
                }
            }
            else if (longest_match.length > 0)
            {
                /* No further transition; fall back to the last accepted match. */
                *out_match_info = longest_match;
                return P_SUCCESS;
            }
            else
            {
                /* Nothing was accepted; report the length examined, including
                 * the code point that had no transition. */
                *out_unexpected_input_length = attempt_match.length + code_point_length;
                return P_UNEXPECTED_INPUT;
            }
            break;

        case P_EOF:
            /* We hit EOF. */
            if (longest_match.length > 0)
            {
                /* We have a match, so use it. */
                *out_match_info = longest_match;
                return P_SUCCESS;
            }
            else if (attempt_match.length != 0)
            {
                /* There is a partial match - error! */
                *out_unexpected_input_length = attempt_match.length;
                return P_UNEXPECTED_INPUT;
            }
            else
            {
                /* Valid EOF return. */
                return P_EOF;
            }
            break;

        default:
            /* Any other decoder result (e.g. P_DECODE_ERROR) is passed through. */
            return result;
        }
    }
}

/**
 * Attempt to lex the next token in the input stream.
 *
 * On a successful match the context's input index and position are advanced
 * past the matched text, whether or not a token is produced.
 *
 * @param context
 *   Lexer/parser context structure.
 * @param[out] out_token_info
 *   The lexed token information is stored here if the return value is
 *   P_SUCCESS.
 *
 * @retval P_SUCCESS
 *   A token was successfully lexed. This includes reaching the end of the
 *   input, which is reported as the TOKEN___EOF token.
 * @retval P_DECODE_ERROR
 *   The decoder encountered invalid text encoding.
 * @retval P_UNEXPECTED_INPUT
 *   Input text does not match any lexer pattern.
 * @retval P_DROP
 *   A drop pattern was matched so the lexer should continue.
 */
private size_t attempt_lex_token(p_context_t * context, TokenInfo * out_token_info)
{
    TokenInfo token_info;
    /* The token starts at the position the lexer is at before matching. */
    token_info.position = context.input_position;
    token_info.token = INVALID_TOKEN_ID;
    *out_token_info = token_info; // TODO: remove
    MatchInfo match_info;
    size_t unexpected_input_length;
    size_t result = find_longest_match(context, &match_info, &unexpected_input_length);
    switch (result)
    {
    case P_SUCCESS:
        Token token_to_accept = match_info.accepting_state.token;
        if (match_info.accepting_state.code_id != INVALID_USER_CODE_ID)
        {
            /* Run the pattern's user code, passing it the matched text. */
            Token user_code_token = lexer_user_code(context, match_info.accepting_state.code_id, context.input[context.input_index..(context.input_index + match_info.length)], &token_info);
            /* An invalid Token from lexer_user_code() means that the user
             * code did not explicitly return a token. So only override
             * the token to return if the user code does explicitly
             * return a token. */
            if (user_code_token != INVALID_TOKEN_ID)
            {
                token_to_accept = user_code_token;
            }
        }

        /* Update the input position tracking. */
        context.input_index += match_info.length;
        context.input_position.row += match_info.delta_position.row;
        if (match_info.delta_position.row != 0u)
        {
            /* The match crossed a line break, so the delta column is the
             * column reached on the new row. */
            context.input_position.col = match_info.delta_position.col;
        }
        else
        {
            context.input_position.col += match_info.delta_position.col;
        }
        if (token_to_accept == INVALID_TOKEN_ID)
        {
            /* Neither the pattern nor its user code produced a token. */
            return P_DROP;
        }
        token_info.token = token_to_accept;
        token_info.length = match_info.length;
        *out_token_info = token_info;
        return P_SUCCESS;

    case P_EOF:
        /* End of input is reported to the caller as an EOF token. */
        token_info.token = TOKEN___EOF;
        *out_token_info = token_info;
        return P_SUCCESS;

    default:
        /* P_DECODE_ERROR or P_UNEXPECTED_INPUT: pass the result through. */
        return result;
    }
}

/**
 * Lex the next token in the input stream.
 *
 * Matches that produce no token (P_DROP) are skipped; lexing is retried
 * until a token is produced or an error occurs.
 *
 * @param context
 *   Lexer/parser context structure.
 * @param[out] out_token_info
 *   The lexed token information is stored here if the return value is
 *   P_SUCCESS.
 *
 * @retval P_SUCCESS
 *   A token was successfully lexed.
 * @retval P_DECODE_ERROR
 *   The decoder encountered invalid text encoding.
 * @retval P_UNEXPECTED_INPUT
 *   Input text does not match any lexer pattern.
 */
public size_t p_lex(p_context_t * context, TokenInfo * out_token_info)
{
    size_t status;
    do
    {
        status = attempt_lex_token(context, out_token_info);
    } while (status == P_DROP);
    return status;
}

/**************************************************************************
 * Parser
 *************************************************************************/

/* Integer types for the parser tables. Each is selected at generation time
 * from the size or maximum value it must represent. */

/* Type for reduce table indexes and counts. */
private alias ReduceID = <%= get_type_for(@parser.reduce_table.size) %>;
<% # A "symbol" is either a token ID or a rule set ID. %>
<% # %>
<% # Rule set IDs start after token IDs, so to store either a token ID %>
<% # or a rule set ID, we just need to know the maximum rule set ID. %>
/* Type for a symbol (a token ID or a rule set ID). */
private alias SymbolID = <%= get_type_for(@parser.rule_sets.map(&:last).map(&:id).max) %>;
/* Type for parser state IDs. */
private alias StateID = <%= get_type_for(@parser.state_table.size) %>;
/* Type for grammar rule IDs. */
private alias RuleID = <%= get_type_for(@grammar.rules.size) %>;
/* Type for shift table indexes and counts. */
private alias ShiftID = <%= get_type_for(@parser.shift_table.size) %>;

/* A shift table entry. */
private struct Shift
{
    /* Token or rule set ID that triggers this shift. */
    SymbolID symbol;
    /* Parser state to shift to. */
    StateID state;
}

/* A reduce table entry. */
private struct Reduce
{
    /* Lookahead token selecting this reduction, or INVALID_TOKEN_ID to
     * select it for any lookahead token. */
    Token token;
    /* ID of the rule whose user code runs for this reduction. */
    RuleID rule;
    /* ID of the rule set produced by this reduction. */
    SymbolID rule_set;
    /* Number of entries popped from the parser's state stack. */
    StateID n_states;
}

/* A parser state: its slices of the shift and reduce tables. */
private struct ParserState
{
    /* Index in parser_shifts of this state's first shift entry. */
    ShiftID shift_table_index;
    /* Number of consecutive shift entries belonging to this state. */
    ShiftID n_shift_entries;
    /* Index in parser_reduces of this state's first reduce entry. */
    ReduceID reduce_table_index;
    /* Number of consecutive reduce entries belonging to this state. */
    ReduceID n_reduce_entries;
}

/* An entry on the parser's stack: a parser state and the parser value
 * associated with it. */
private struct StateValue
{
    /* Parser state ID. */
    size_t state;
    /* Parser value of the token or rule set shifted to reach this state. */
    ParserValue pvalue;

    /* Construct an entry for the given state; pvalue keeps its default. */
    this(size_t state)
    {
        this.state = state;
    }
}

/* Shift table. Each parser state owns a contiguous run of entries (see
 * ParserState.shift_table_index and ParserState.n_shift_entries). */
private static immutable Shift[] parser_shifts = [
<%   @parser.shift_table.each do |shift| %>
    Shift(<%= shift[:symbol_id] %>u, <%= shift[:state_id] %>u),
<%   end %>
];

/* Reduce table. Each parser state owns a contiguous run of entries (see
 * ParserState.reduce_table_index and ParserState.n_reduce_entries). */
private static immutable Reduce[] parser_reduces = [
<%   @parser.reduce_table.each do |reduce| %>
    Reduce(<%= reduce[:token_id] %>u, <%= reduce[:rule_id] %>u, <%= reduce[:rule_set_id] %>u, <%= reduce[:n_states] %>u),
<%   end %>
];

/* Parser state table, indexed by parser state ID. */
private static immutable ParserState[] parser_states = [
<%   @parser.state_table.each do |state| %>
    ParserState(<%= state[:shift_index] %>u, <%= state[:n_shifts] %>u, <%= state[:reduce_index] %>u, <%= state[:n_reduces] %>u),
<%   end %>
];

/**
 * Execute user code associated with a parser rule.
 *
 * One switch case is generated for each grammar rule that has user code; a
 * rule without user code yields a default-initialized parse value.
 *
 * @param rule The ID of the rule.
 * @param statevalues
 *   The parser's state/value stack at the time of the reduction.
 * @param n_states
 *   Number of stack entries being reduced by this rule.
 *
 * @return Parse value.
 */
private ParserValue parser_user_code(uint rule, StateValue[] statevalues, uint n_states)
{
    /* Result value; presumably assigned by the expanded user code below
     * (the expansion is not visible in this template). */
    ParserValue _pvalue;

    switch (rule)
    {
<% @grammar.rules.each do |rule| %>
<%   if rule.code %>
    case <%= rule.id %>u: {
<%= expand_code(rule.code, true, rule, nil) %>
    } break;
<%   end %>
<% end %>
    default: break;
    }

    return _pvalue;
}

/**
 * Check if the parser should shift to a new state.
 *
 * The shift entries belonging to the given state are scanned in table order
 * for one whose symbol equals the incoming symbol.
 *
 * @param state
 *   Parser state ID.
 * @param symbol
 *   Incoming token/rule set ID.
 *
 * @return State to shift to, or INVALID_ID if none.
 */
private size_t check_shift(size_t state, size_t symbol)
{
    /* Use size_t for table indexes, matching this function's parameters and
     * return type as well as check_reduce(). */
    size_t start = parser_states[state].shift_table_index;
    size_t end = start + parser_states[state].n_shift_entries;
    for (size_t i = start; i < end; i++)
    {
        if (parser_shifts[i].symbol == symbol)
        {
//                    if (symbol != INVALID_TOKEN_ID)
//                    {
//                        writeln("Shifting ", p_token_names[symbol]);
//                    }
//                    else
//                    {
//                        writeln("Shifting rule set ", symbol);
//                    }
            return parser_shifts[i].state;
        }
    }
    return INVALID_ID;
}

/**
 * Check if the parser should reduce in the given state.
 *
 * The reduce entries belonging to the state are scanned in table order. An
 * entry is selected when its lookahead token equals the incoming token, or
 * when its lookahead token is INVALID_TOKEN_ID (matches any token).
 *
 * @param state
 *   Parser state ID.
 * @param token
 *   Incoming token ID.
 *
 * @return Index into parser_reduces of the reduction to perform, or
 *   INVALID_ID if none.
 */
private size_t check_reduce(size_t state, Token token)
{
    const(ParserState) * parser_state = &parser_states[state];
    immutable size_t first = parser_state.reduce_table_index;
    immutable size_t limit = first + parser_state.n_reduce_entries;
    size_t index = first;
    while (index < limit)
    {
        immutable Token lookahead = parser_reduces[index].token;
        if ((lookahead == INVALID_TOKEN_ID) || (lookahead == token))
        {
//            write("Reducing rule ", parser_reduces[index].rule, ", rule set ", parser_reduces[index].rule_set, " lookahead ");
//            if (token != INVALID_TOKEN_ID)
//            {
//                writeln(p_token_names[token]);
//            }
//            else
//            {
//                writeln("{other}");
//            }
            return index;
        }
        index++;
    }
    return INVALID_ID;
}

/**
 * Run the parser.
 *
 * Tokens are pulled from p_lex() as needed and the shift/reduce tables are
 * consulted in a loop until the end-of-file token can be shifted or an error
 * occurs.
 *
 * @param context
 *   Lexer/parser context structure.
 *
 * @retval P_SUCCESS
 *   The parser successfully matched the input text. The parse result value
 *   can be accessed with p_result().
 * @retval P_UNEXPECTED_TOKEN
 *   An unexpected token was encountered that does not match any grammar rule.
 *   A message naming the token is also written to standard output.
 * @retval P_DECODE_ERROR
 *   The decoder encountered invalid text encoding.
 * @retval P_UNEXPECTED_INPUT
 *   Input text does not match any lexer pattern.
 */
public size_t p_parse(p_context_t * context)
{
    TokenInfo token_info;
    /* Current lookahead token; INVALID_TOKEN_ID means one must be lexed. */
    Token token = INVALID_TOKEN_ID;
    /* Parser stack, starting with a single default entry (state 0). */
    StateValue[] statevalues = new StateValue[](1);
    /* Rule set produced by the most recent reduction and not yet shifted. */
    size_t reduced_rule_set = INVALID_ID;
    ParserValue reduced_parser_value;
    for (;;)
    {
        if (token == INVALID_TOKEN_ID)
        {
            /* The previous token was consumed (or none was read yet). */
            size_t lexer_result = p_lex(context, &token_info);
            if (lexer_result != P_SUCCESS)
            {
                return lexer_result;
            }
            token = token_info.token;
        }
        size_t shift_state = INVALID_ID;
        if (reduced_rule_set != INVALID_ID)
        {
            /* A pending reduced rule set is shifted before the token. */
            shift_state = check_shift(statevalues[$-1].state, reduced_rule_set);
        }
        if (shift_state == INVALID_ID)
        {
            shift_state = check_shift(statevalues[$-1].state, token);
            if ((shift_state != INVALID_ID) && (token == TOKEN___EOF))
            {
                /* Successful parse. */
                context.parse_result = statevalues[$-1].pvalue;
                return P_SUCCESS;
            }
        }
        if (shift_state != INVALID_ID)
        {
            /* We have something to shift. */
            statevalues ~= StateValue(shift_state);
            if (reduced_rule_set == INVALID_ID)
            {
                /* We shifted a token, mark it consumed. */
                token = INVALID_TOKEN_ID;
                statevalues[$-1].pvalue = token_info.pvalue;
            }
            else
            {
                /* We shifted a RuleSet. */
                statevalues[$-1].pvalue = reduced_parser_value;
                /* Reset the pending reduction to default values. */
                ParserValue new_parse_result;
                reduced_parser_value = new_parse_result;
                reduced_rule_set = INVALID_ID;
            }
            continue;
        }

        size_t reduce_index = check_reduce(statevalues[$-1].state, token);
        if (reduce_index != INVALID_ID)
        {
            /* We have something to reduce. */
            reduced_parser_value = parser_user_code(parser_reduces[reduce_index].rule, statevalues, parser_reduces[reduce_index].n_states);
            reduced_rule_set = parser_reduces[reduce_index].rule_set;
            /* Pop the reduced entries; the rule set is shifted on the next
             * loop iteration. */
            statevalues.length -= parser_reduces[reduce_index].n_states;
            continue;
        }

        /* Error, unexpected token. */
        write("Unexpected token ");
        if (token != INVALID_TOKEN_ID)
        {
            writeln(p_token_names[token]);
        }
        else
        {
            writeln("{other}");
        }
        return P_UNEXPECTED_TOKEN;
    }
}

/**
 * Get the parse result value.
 *
 * Reads one member of the parse result stored in the context by p_parse().
 * The member name and the return type are both taken from start_rule_type
 * when this file is generated.
 *
 * @param context
 *   Lexer/parser context structure.
 *
 * @return Parse result value.
 */
public <%= start_rule_type[1] %> p_result(p_context_t * context)
{
    return context.parse_result.v_<%= start_rule_type[0] %>;
}