/**
 * @file
 *
 * This file is generated by Propane.
 */

<% if @grammar.modulename %>
module <%= @grammar.modulename %>;
<% end %>

/**************************************************************************
 * User code blocks
 *************************************************************************/

<%= @grammar.code_blocks.fetch("", "") %>

/**************************************************************************
 * Public types
 *************************************************************************/

/* Result codes. */
public enum : size_t
{
    <%= @grammar.prefix.upcase %>SUCCESS,
    <%= @grammar.prefix.upcase %>DECODE_ERROR,
    <%= @grammar.prefix.upcase %>UNEXPECTED_INPUT,
    <%= @grammar.prefix.upcase %>UNEXPECTED_TOKEN,
    <%= @grammar.prefix.upcase %>DROP,
    <%= @grammar.prefix.upcase %>EOF,
}

/** Token type. */
public alias <%= @grammar.prefix %>token_t = <%= get_type_for(@grammar.invalid_token_id) %>;

/** Token IDs. */
public enum : <%= @grammar.prefix %>token_t
{
<% @grammar.tokens.each_with_index do |token, index| %>
    TOKEN_<%= token.code_name %> = <%= index %>,
<% unless token.id == index %>
<% raise "Token ID (#{token.id}) does not match index (#{index}) for token #{token.name}!" %>
<% end %>
<% end %>
    INVALID_TOKEN_ID = <%= @grammar.invalid_token_id %>,
}

/** Code point type. */
public alias <%= @grammar.prefix %>code_point_t = uint;

/** Parser values type(s). */
public union <%= @grammar.prefix %>value_t
{
<% @grammar.ptypes.each do |name, typestring| %>
    <%= typestring %> v_<%= name %>;
<% end %>
}

/**
 * A structure to keep track of parser position.
 *
 * This is useful for reporting errors, etc...
 */
public struct <%= @grammar.prefix %>position_t
{
    /** Input text row (0-based). */
    uint row;

    /** Input text column (0-based). */
    uint col;
}

/** Lexed token information. */
public struct <%= @grammar.prefix %>token_info_t
{
    /** Text position where the token was found. */
    <%= @grammar.prefix %>position_t position;

    /** Number of input bytes used by the token. */
    size_t length;

    /** Token that was lexed. */
    <%= @grammar.prefix %>token_t token;

    /** Parser value associated with the token. */
    <%= @grammar.prefix %>value_t pvalue;
}

/**
 * Lexer and parser context.
 *
 * The user must allocate an instance of this structure and pass it to any
 * public API function.
 */
public struct <%= @grammar.prefix %>context_t
{
    /* Lexer context data. */

    /** Input text. */
    string input;

    /** Input text index (byte offset). */
    size_t input_index;

    /** Input text position (row/column). */
    <%= @grammar.prefix %>position_t text_position;

    /** Current lexer mode. */
    size_t mode;

    /* Parser context data. */

    /** Parse result value. */
    <%= @grammar.prefix %>value_t parse_result;

    /** Unexpected token received. */
    <%= @grammar.prefix %>token_t token;
}

/**************************************************************************
 * Public data
 *************************************************************************/

/** Token names. */
public immutable string[] <%= @grammar.prefix %>token_names = [
<% @grammar.tokens.each_with_index do |token, index| %>
    "<%= token.name %>",
<% end %>
];

/**************************************************************************
 * Private types
 *************************************************************************/

<% if @grammar.prefix.upcase != "P_" %>
/* Result codes.
 *
 * These private P_-prefixed aliases let the generated implementation below
 * use fixed names regardless of the user-configured prefix; they are only
 * emitted when they would not collide with the public enum above. */
private enum : size_t
{
    P_SUCCESS,
    P_DECODE_ERROR,
    P_UNEXPECTED_INPUT,
    P_UNEXPECTED_TOKEN,
    P_DROP,
    P_EOF,
}
<% end %>

/* An invalid ID value. */
private enum size_t INVALID_ID = cast(size_t)-1;

/**************************************************************************
 * State initialization
 *************************************************************************/

/**
 * Initialize lexer/parser context structure.
 *
 * @param[out] context
 *   Lexer/parser context structure.
 * @param input
 *   Text input.
 */
public void <%= @grammar.prefix %>context_init(<%= @grammar.prefix %>context_t * context, string input)
{
    /* New default-initialized context structure. */
    <%= @grammar.prefix %>context_t newcontext;

    /* Lexer initialization. */
    newcontext.input = input;
    newcontext.mode = <%= @lexer.mode_id("default") %>;

    /* Copy to the user's context structure. */
    *context = newcontext;
}

/**************************************************************************
 * Decoder
 *************************************************************************/

/**
 * Decode a UTF-8 code point.
 *
 * @param input
 *   Text input to decode.
 * @param[out] out_code_point
 *   The decoded code point is stored here if the return value is P_SUCCESS.
 * @param[out] out_code_point_length
 *   The number of bytes the code point used is stored here if the return value
 *   is P_SUCCESS.
 *
 * @retval P_SUCCESS on a successful code point decode
 * @retval P_DECODE_ERROR when an encoding error is observed
 * @retval P_EOF when the end of the text input is reached
 */
public size_t <%= @grammar.prefix %>decode_code_point(string input, <%= @grammar.prefix %>code_point_t * out_code_point, ubyte * out_code_point_length)
{
    if (input.length == 0u)
    {
        return P_EOF;
    }
    char c = input[0];
    <%= @grammar.prefix %>code_point_t code_point;
    ubyte code_point_length;
    if ((c & 0x80u) == 0u)
    {
        /* Single-byte (ASCII) code point. */
        code_point = c;
        code_point_length = 1u;
    }
    else
    {
        /* Multi-byte sequence: the leading byte determines how many
         * continuation bytes follow and contributes the high bits. */
        ubyte following_bytes;
        if ((c & 0xE0u) == 0xC0u)
        {
            code_point = c & 0x1Fu;
            following_bytes = 1u;
        }
        else if ((c & 0xF0u) == 0xE0u)
        {
            code_point = c & 0x0Fu;
            following_bytes = 2u;
        }
        else if ((c & 0xF8u) == 0xF0u)
        {
            code_point = c & 0x07u;
            following_bytes = 3u;
        }
        else if ((c & 0xFCu) == 0xF8u)
        {
            /* NOTE(review): 5- and 6-byte sequences (and overlong encodings)
             * are not valid in strict UTF-8 per RFC 3629, but are accepted
             * here -- confirm this permissiveness is intentional. */
            code_point = c & 0x03u;
            following_bytes = 4u;
        }
        else if ((c & 0xFEu) == 0xFCu)
        {
            code_point = c & 0x01u;
            following_bytes = 5u;
        }
        else
        {
            return P_DECODE_ERROR;
        }
        if (input.length <= following_bytes)
        {
            /* Truncated sequence: not enough input bytes remain. */
            return P_DECODE_ERROR;
        }
        code_point_length = cast(ubyte)(following_bytes + 1u);
        for (size_t i = 0u; i < following_bytes; i++)
        {
            char b = input[i + 1u];
            if ((b & 0xC0u) != 0x80u)
            {
                /* Invalid continuation byte (must be 10xxxxxx). */
                return P_DECODE_ERROR;
            }
            code_point = (code_point << 6u) | (b & 0x3Fu);
        }
    }
    *out_code_point = code_point;
    *out_code_point_length = code_point_length;
    return P_SUCCESS;
}

/**************************************************************************
 * Lexer
 *************************************************************************/

/** Lexer state ID type. */
private alias lexer_state_id_t = <%= get_type_for(@lexer.state_table.size) %>;

/** Invalid lexer state ID. */
private enum lexer_state_id_t INVALID_LEXER_STATE_ID = <%= @lexer.state_table.size %>u;

/** Lexer user code ID type. */
<% user_code_id_count = (@grammar.patterns.map(&:code_id).compact.max || 0) + 1 %>
private alias lexer_user_code_id_t = <%= get_type_for(user_code_id_count) %>;

/** Invalid lexer user code ID. */
private enum lexer_user_code_id_t INVALID_USER_CODE_ID = <%= user_code_id_count %>u;

/**
 * Lexer transition table entry.
 *
 * An incoming code point matching the range for a transition entry will cause
 * the lexer to progress to the destination state.
 */
private struct lexer_transition_t
{
    /** First code point in the range for this transition. */
    <%= @grammar.prefix %>code_point_t first;

    /** Last code point in the range for this transition. */
    <%= @grammar.prefix %>code_point_t last;

    /** Destination lexer state ID for this transition. */
    lexer_state_id_t destination_state;
}

/** Lexer state table entry. */
private struct lexer_state_t
{
    /** Index to the transition table for this state. */
    <%= get_type_for(@lexer.transition_table.size - 1) %> transition_table_index;

    /** Number of transition table entries for this state. */
    <%= get_type_for(@lexer.state_table.map {|ste| ste[:n_transitions]}.max) %> n_transitions;

    /** Lexer token formed at this state. */
    <%= @grammar.prefix %>token_t token;

    /** Lexer user code ID to execute at this state. */
    lexer_user_code_id_t code_id;

    /** Whether this state matches a lexer pattern. */
    bool accepts;
}

/** Lexer mode table entry. */
private struct lexer_mode_t
{
    /** Offset in the state table to be used for this mode. */
    uint state_table_offset;
}

/**
 * Lexer match info structure.
 *
 * This structure holds output values from the lexer upon a successful pattern
 * match.
 */
private struct lexer_match_info_t
{
    /** Number of bytes of input text used to match. */
    size_t length;

    /** Input text position delta. */
    <%= @grammar.prefix %>position_t delta_position;

    /** Accepting lexer state from the match. */
    const(lexer_state_t) * accepting_state;
}

/** Lexer transition table. */
private immutable lexer_transition_t[] lexer_transition_table = [
<% @lexer.transition_table.each do |transition_table_entry| %>
    lexer_transition_t(<%= transition_table_entry[:first] %>u, <%= transition_table_entry[:last] %>u, <%= transition_table_entry[:destination] %>u),
<% end %>
];

/** Lexer state table. */
private immutable lexer_state_t[] lexer_state_table = [
<% @lexer.state_table.each do |state_table_entry| %>
    lexer_state_t(<%= state_table_entry[:transition_table_index] %>u, <%= state_table_entry[:n_transitions] %>u, <%= state_table_entry[:token] || "INVALID_TOKEN_ID" %>, <%= state_table_entry[:code_id] || "INVALID_USER_CODE_ID" %>, <%= state_table_entry[:accepts] %>),
<% end %>
];

/** Lexer mode table. */
private immutable lexer_mode_t[] lexer_mode_table = [
<% @lexer.mode_table.each do |mode_table_entry| %>
    lexer_mode_t(<%= mode_table_entry[:state_table_offset] %>),
<% end %>
];

/**
 * Execute user code associated with a lexer pattern.
 *
 * @param context
 *   Lexer/parser context structure.
 * @param code_id
 *   The ID of the user code block to execute.
 * @param match
 *   Matched text for this pattern.
 * @param out_token_info
 *   Lexer token info in progress.
 *
 * @return Token to accept, or invalid token if the user code does
 *   not explicitly return a token.
 */
private <%= @grammar.prefix %>token_t lexer_user_code(<%= @grammar.prefix %>context_t * context, lexer_user_code_id_t code_id, string match, <%= @grammar.prefix %>token_info_t * out_token_info)
{
    switch (code_id)
    {
<% @grammar.patterns.each do |pattern| %>
<% if pattern.code_id %>
    case <%= pattern.code_id %>u:
        {
            <%= expand_code(pattern.code, false, nil, pattern) %>
        }
        break;
<% end %>
<% end %>
    default:
        break;
    }

    /* Reaching here means the user code did not explicitly return a token. */
    return INVALID_TOKEN_ID;
}

/**
 * Check if there is a transition from the current lexer state to another
 * based on the given input code point.
 *
 * @param current_state
 *   Current lexer state.
 * @param code_point
 *   Input code point.
 *
 * @return Lexer state to transition to, or INVALID_LEXER_STATE_ID if none.
 */
private lexer_state_id_t check_lexer_transition(uint current_state, uint code_point)
{
    uint transition_table_index = lexer_state_table[current_state].transition_table_index;
    for (uint i = 0u; i < lexer_state_table[current_state].n_transitions; i++)
    {
        if ((lexer_transition_table[transition_table_index + i].first <= code_point) &&
            (code_point <= lexer_transition_table[transition_table_index + i].last))
        {
            return lexer_transition_table[transition_table_index + i].destination_state;
        }
    }
    return INVALID_LEXER_STATE_ID;
}

/**
 * Find the longest lexer pattern match at the current position.
 *
 * @param context
 *   Lexer/parser context structure.
 * @param[out] out_match_info
 *   The longest match information is stored here if the return value is
 *   P_SUCCESS (or the attempted match on P_DECODE_ERROR, so the caller can
 *   still update the input text position).
 * @param[out] out_unexpected_input_length
 *   The number of input bytes that failed to match is stored here if the
 *   return value is P_UNEXPECTED_INPUT.
 *
 * @retval P_SUCCESS
 *   A token was successfully lexed.
 * @retval P_DECODE_ERROR
 *   The decoder encountered invalid text encoding.
 * @retval P_UNEXPECTED_INPUT
 *   Input text does not match any lexer pattern.
 * @retval P_EOF
 *   The end of the text input was reached.
 */
private size_t find_longest_match(<%= @grammar.prefix %>context_t * context, lexer_match_info_t * out_match_info, size_t * out_unexpected_input_length)
{
    lexer_match_info_t longest_match;
    lexer_match_info_t attempt_match;
    *out_match_info = longest_match;
    uint current_state = lexer_mode_table[context.mode].state_table_offset;
    for (;;)
    {
        string input = context.input[(context.input_index + attempt_match.length)..(context.input.length)];
        <%= @grammar.prefix %>code_point_t code_point;
        ubyte code_point_length;
        size_t result = <%= @grammar.prefix %>decode_code_point(input, &code_point, &code_point_length);
        switch (result)
        {
        case P_SUCCESS:
            lexer_state_id_t transition_state = check_lexer_transition(current_state, code_point);
            if (transition_state != INVALID_LEXER_STATE_ID)
            {
                attempt_match.length += code_point_length;
                if (code_point == '\n')
                {
                    attempt_match.delta_position.row++;
                    attempt_match.delta_position.col = 0u;
                }
                else
                {
                    attempt_match.delta_position.col++;
                }
                current_state = transition_state;
                if (lexer_state_table[current_state].accepts)
                {
                    /* Record this accepting state as the longest match so
                     * far, then keep going in case a longer match exists. */
                    attempt_match.accepting_state = &lexer_state_table[current_state];
                    longest_match = attempt_match;
                }
            }
            else if (longest_match.length > 0)
            {
                /* No transition, but we already have a match to use. */
                *out_match_info = longest_match;
                return P_SUCCESS;
            }
            else
            {
                /* No transition and nothing matched so far - error. */
                *out_unexpected_input_length = attempt_match.length + code_point_length;
                return P_UNEXPECTED_INPUT;
            }
            break;
        case P_EOF:
            /* We hit EOF. */
            if (longest_match.length > 0)
            {
                /* We have a match, so use it. */
                *out_match_info = longest_match;
                return P_SUCCESS;
            }
            else if (attempt_match.length != 0)
            {
                /* There is a partial match - error! */
                *out_unexpected_input_length = attempt_match.length;
                return P_UNEXPECTED_INPUT;
            }
            else
            {
                /* Valid EOF return. */
                return P_EOF;
            }
            break;
        case P_DECODE_ERROR:
            /* If we see a decode error, we may be partially in the middle of
             * matching a pattern, so return the attempted match info so that
             * the input text position can be updated. */
            *out_match_info = attempt_match;
            return result;
        default:
            return result;
        }
    }
}

/**
 * Attempt to lex the next token in the input stream.
 *
 * @param context
 *   Lexer/parser context structure.
 * @param[out] out_token_info
 *   The lexed token information is stored here if the return value is
 *   P_SUCCESS.
 *
 * @retval P_SUCCESS
 *   A token was successfully lexed.
 * @retval P_DECODE_ERROR
 *   The decoder encountered invalid text encoding.
 * @retval P_UNEXPECTED_INPUT
 *   Input text does not match any lexer pattern.
 * @retval P_DROP
 *   A drop pattern was matched so the lexer should continue.
 */
private size_t attempt_lex_token(<%= @grammar.prefix %>context_t * context, <%= @grammar.prefix %>token_info_t * out_token_info)
{
    <%= @grammar.prefix %>token_info_t token_info;
    token_info.position = context.text_position;
    token_info.token = INVALID_TOKEN_ID;
    *out_token_info = token_info; // TODO: remove
    lexer_match_info_t match_info;
    size_t unexpected_input_length;
    size_t result = find_longest_match(context, &match_info, &unexpected_input_length);
    switch (result)
    {
    case P_SUCCESS:
        <%= @grammar.prefix %>token_t token_to_accept = match_info.accepting_state.token;
        if (match_info.accepting_state.code_id != INVALID_USER_CODE_ID)
        {
            string match = context.input[context.input_index..(context.input_index + match_info.length)];
            <%= @grammar.prefix %>token_t user_code_token = lexer_user_code(context, match_info.accepting_state.code_id, match, &token_info);
            /* An invalid token returned from lexer_user_code() means that the
             * user code did not explicitly return a token. So only override
             * the token to return if the user code does explicitly return a
             * token. */
            if (user_code_token != INVALID_TOKEN_ID)
            {
                token_to_accept = user_code_token;
            }
        }
        /* Update the input position tracking. */
        context.input_index += match_info.length;
        context.text_position.row += match_info.delta_position.row;
        if (match_info.delta_position.row != 0u)
        {
            /* The match spanned a newline, so the column resets. */
            context.text_position.col = match_info.delta_position.col;
        }
        else
        {
            context.text_position.col += match_info.delta_position.col;
        }
        if (token_to_accept == INVALID_TOKEN_ID)
        {
            /* A matched pattern with no token is a drop pattern. */
            return P_DROP;
        }
        token_info.token = token_to_accept;
        token_info.length = match_info.length;
        *out_token_info = token_info;
        return P_SUCCESS;
    case P_EOF:
        /* EOF is surfaced to the parser as a synthetic EOF token. */
        token_info.token = TOKEN___EOF;
        *out_token_info = token_info;
        return P_SUCCESS;
    case P_DECODE_ERROR:
        /* Update the input position tracking. */
        context.input_index += match_info.length;
        context.text_position.row += match_info.delta_position.row;
        if (match_info.delta_position.row != 0u)
        {
            context.text_position.col = match_info.delta_position.col;
        }
        else
        {
            context.text_position.col += match_info.delta_position.col;
        }
        return result;
    default:
        return result;
    }
}

/**
 * Lex the next token in the input stream.
 *
 * @param context
 *   Lexer/parser context structure.
 * @param[out] out_token_info
 *   The lexed token information is stored here if the return value is
 *   P_SUCCESS.
 *
 * @retval P_SUCCESS
 *   A token was successfully lexed.
 * @retval P_DECODE_ERROR
 *   The decoder encountered invalid text encoding.
 * @retval P_UNEXPECTED_INPUT
 *   Input text does not match any lexer pattern.
 */
public size_t <%= @grammar.prefix %>lex(<%= @grammar.prefix %>context_t * context, <%= @grammar.prefix %>token_info_t * out_token_info)
{
    /* Keep lexing until something other than a drop pattern is seen. */
    for (;;)
    {
        size_t result = attempt_lex_token(context, out_token_info);
        if (result != P_DROP)
        {
            return result;
        }
    }
}

/**************************************************************************
 * Parser
 *************************************************************************/

/** Reduce ID type. */
private alias reduce_id_t = <%= get_type_for(@parser.reduce_table.size) %>;

/**
 * A symbol ID can hold either a token ID or a rule set ID.
 *
 * Token IDs and rule set IDs share the same namespace, with rule set IDs
 * beginning after token IDs end.
 */
private alias symbol_id_t = <%= get_type_for(@parser.rule_sets.map(&:last).map(&:id).max) %>;

/** Parser state ID type. */
private alias parser_state_id_t = <%= get_type_for(@parser.state_table.size) %>;

/** Parser rule ID type. */
private alias rule_id_t = <%= get_type_for(@grammar.rules.size) %>;

/** Parser shift ID type. */
private alias shift_id_t = <%= get_type_for(@parser.shift_table.size) %>;

/** Shift table entry. */
private struct shift_t
{
    /** Token or rule set ID. */
    symbol_id_t symbol_id;

    /** Parser state to shift to. */
    parser_state_id_t state_id;
}

/** Reduce table entry. */
private struct reduce_t
{
    /** Lookahead token. */
    <%= @grammar.prefix %>token_t token;

    /**
     * Rule ID.
     *
     * This is used to execute the parser user code block associated with a
     * grammar rule.
     */
    rule_id_t rule;

    /**
     * Rule set ID.
     *
     * This is used as the new top symbol ID of the parse stack after this
     * reduce action.
     */
    symbol_id_t rule_set;

    /**
     * Number of states leading to this reduce action.
     *
     * This is the number of entries popped from the parse stack after this
     * reduce action.
     */
    parser_state_id_t n_states;
}

/** Parser state entry. */
private struct parser_state_t
{
    /** First shift table entry for this parser state. */
    shift_id_t shift_table_index;

    /** Number of shift table entries for this parser state. */
    shift_id_t n_shift_entries;

    /** First reduce table entry for this parser state. */
    reduce_id_t reduce_table_index;

    /** Number of reduce table entries for this parser state. */
    reduce_id_t n_reduce_entries;
}

/**
 * Structure to hold a state ID and value pair.
 *
 * A stack of these structures makes up the parse stack.
 */
private struct state_value_t
{
    /** Parser state ID. */
    size_t state_id;

    /** Parser value from this state. */
    <%= @grammar.prefix %>value_t pvalue;

    this(size_t state_id)
    {
        this.state_id = state_id;
    }
}

/** Parser shift table. */
private immutable shift_t[] parser_shift_table = [
<% @parser.shift_table.each do |shift| %>
    shift_t(<%= shift[:symbol_id] %>u, <%= shift[:state_id] %>u),
<% end %>
];

/** Parser reduce table. */
private immutable reduce_t[] parser_reduce_table = [
<% @parser.reduce_table.each do |reduce| %>
    reduce_t(<%= reduce[:token_id] %>u, <%= reduce[:rule_id] %>u, <%= reduce[:rule_set_id] %>u, <%= reduce[:n_states] %>u),
<% end %>
];

/** Parser state table. */
private immutable parser_state_t[] parser_state_table = [
<% @parser.state_table.each do |state| %>
    parser_state_t(<%= state[:shift_index] %>u, <%= state[:n_shifts] %>u, <%= state[:reduce_index] %>u, <%= state[:n_reduces] %>u),
<% end %>
];

/**
 * Execute user code associated with a parser rule.
 *
 * @param rule The ID of the rule.
 * @param statevalues Parse stack entries (made available to the rule's
 *   expanded user code).
 * @param n_states Number of parse stack entries associated with this rule
 *   (made available to the rule's expanded user code).
 *
 * @return Parse value.
 */
private <%= @grammar.prefix %>value_t parser_user_code(uint rule, state_value_t[] statevalues, uint n_states)
{
    <%= @grammar.prefix %>value_t _pvalue;

    switch (rule)
    {
<% @grammar.rules.each do |rule| %>
<% if rule.code %>
    case <%= rule.id %>u:
        {
            <%= expand_code(rule.code, true, rule, nil) %>
        }
        break;
<% end %>
<% end %>
    default:
        break;
    }

    return _pvalue;
}

/**
 * Check if the parser should shift to a new state.
 *
 * @param state_id
 *   Parser state ID.
 * @param symbol_id
 *   Incoming token/rule set ID.
 *
 * @return State to shift to, or INVALID_ID if none.
 */
private size_t check_shift(size_t state_id, size_t symbol_id)
{
    uint start = parser_state_table[state_id].shift_table_index;
    uint end = start + parser_state_table[state_id].n_shift_entries;
    for (uint i = start; i < end; i++)
    {
        if (parser_shift_table[i].symbol_id == symbol_id)
        {
            return parser_shift_table[i].state_id;
        }
    }
    return INVALID_ID;
}

/**
 * Check if the parser should reduce to a new state.
 *
 * @param state_id
 *   Parser state ID.
 * @param token
 *   Incoming token.
 *
 * @return State to reduce to, or INVALID_ID if none.
 */
private size_t check_reduce(size_t state_id, <%= @grammar.prefix %>token_t token)
{
    size_t start = parser_state_table[state_id].reduce_table_index;
    size_t end = start + parser_state_table[state_id].n_reduce_entries;
    for (size_t i = start; i < end; i++)
    {
        /* An INVALID_TOKEN_ID entry matches any lookahead token. */
        if ((parser_reduce_table[i].token == token) ||
            (parser_reduce_table[i].token == INVALID_TOKEN_ID))
        {
            return i;
        }
    }
    return INVALID_ID;
}

/**
 * Run the parser.
 *
 * @param context
 *   Lexer/parser context structure.
 *
 * @retval P_SUCCESS
 *   The parser successfully matched the input text. The parse result value
 *   can be accessed with <%= @grammar.prefix %>result().
 * @retval P_UNEXPECTED_TOKEN
 *   An unexpected token was encountered that does not match any grammar rule.
 *   The value context.token holds the unexpected token.
 * @retval P_DECODE_ERROR
 *   The decoder encountered invalid text encoding.
 * @retval P_UNEXPECTED_INPUT
 *   Input text does not match any lexer pattern.
 */
public size_t <%= @grammar.prefix %>parse(<%= @grammar.prefix %>context_t * context)
{
    <%= @grammar.prefix %>token_info_t token_info;
    <%= @grammar.prefix %>token_t token = INVALID_TOKEN_ID;
    state_value_t[] statevalues = new state_value_t[](1);
    size_t reduced_rule_set = INVALID_ID;
    <%= @grammar.prefix %>value_t reduced_parser_value;
    for (;;)
    {
        if (token == INVALID_TOKEN_ID)
        {
            /* No lookahead token is buffered; lex the next one. */
            size_t lexer_result = <%= @grammar.prefix %>lex(context, &token_info);
            if (lexer_result != P_SUCCESS)
            {
                return lexer_result;
            }
            token = token_info.token;
        }
        size_t shift_state = INVALID_ID;
        if (reduced_rule_set != INVALID_ID)
        {
            /* A just-reduced rule set takes precedence over the token. */
            shift_state = check_shift(statevalues[$-1].state_id, reduced_rule_set);
        }
        if (shift_state == INVALID_ID)
        {
            shift_state = check_shift(statevalues[$-1].state_id, token);
            if ((shift_state != INVALID_ID) && (token == TOKEN___EOF))
            {
                /* Successful parse. */
                context.parse_result = statevalues[$-1].pvalue;
                return P_SUCCESS;
            }
        }
        if (shift_state != INVALID_ID)
        {
            /* We have something to shift. */
            statevalues ~= state_value_t(shift_state);
            if (reduced_rule_set == INVALID_ID)
            {
                /* We shifted a token, mark it consumed. */
                token = INVALID_TOKEN_ID;
                statevalues[$-1].pvalue = token_info.pvalue;
            }
            else
            {
                /* We shifted a RuleSet. */
                statevalues[$-1].pvalue = reduced_parser_value;
                <%= @grammar.prefix %>value_t new_parse_result;
                reduced_parser_value = new_parse_result;
                reduced_rule_set = INVALID_ID;
            }
            continue;
        }
        size_t reduce_index = check_reduce(statevalues[$-1].state_id, token);
        if (reduce_index != INVALID_ID)
        {
            /* We have something to reduce. */
            reduced_parser_value = parser_user_code(parser_reduce_table[reduce_index].rule, statevalues, parser_reduce_table[reduce_index].n_states);
            reduced_rule_set = parser_reduce_table[reduce_index].rule_set;
            statevalues.length -= parser_reduce_table[reduce_index].n_states;
            continue;
        }
        /* A token was successfully lexed, so the input text position was
         * advanced. However, this is an unexpected token, so we want to reset
         * the context text position to point to the token rather than the text
         * after it, so that if the caller wants to report the error position,
         * it will point to the correct position of the unexpected token.
         */
        context.text_position = token_info.position;
        context.token = token;
        return P_UNEXPECTED_TOKEN;
    }
}

/**
 * Get the parse result value.
 *
 * @param context
 *   Lexer/parser context structure.
 *
 * @return Parse result value.
 */
public <%= start_rule_type[1] %> <%= @grammar.prefix %>result(<%= @grammar.prefix %>context_t * context)
{
    return context.parse_result.v_<%= start_rule_type[0] %>;
}

/**
 * Get the current text input position.
 *
 * @param context
 *   Lexer/parser context structure.
 *
 * @return Current text position.
 */
public <%= @grammar.prefix %>position_t <%= @grammar.prefix %>position(<%= @grammar.prefix %>context_t * context)
{
    return context.text_position;
}