Add FindLongestMatchResult

This commit is contained in:
Josh Holtrop 2023-06-04 21:14:07 -04:00
parent 6b0fb4cb12
commit 11348ca351

View File

@ -267,14 +267,6 @@ class <%= @classname %>
<% end %> <% end %>
]; ];
/* Bookkeeping for a (candidate) lexer match, filled in by
 * find_longest_match() and consumed by the caller to advance the input
 * position tracking. */
struct MatchInfo
{
/* Length of the matched input; accumulated from the code_point_length
 * of each consumed code point and added to m_input_position. */
size_t length;
/* Number of '\n' code points consumed; added to m_input_row. */
size_t delta_row;
/* Column change: reset to 0 when a '\n' is consumed, otherwise
 * incremented per code point. Assigned to m_input_col when delta_row is
 * nonzero, added to it otherwise. */
size_t delta_col;
/* Points at the entry in states that accepted this match; the caller
 * treats null as "no match". */
const(State) * accepting_state;
}
struct Result struct Result
{ {
enum Type enum Type
@ -282,6 +274,7 @@ class <%= @classname %>
DECODE_ERROR, DECODE_ERROR,
DROP, DROP,
TOKEN, TOKEN,
UNEXPECTED_INPUT,
} }
Type type; Type type;
@ -349,18 +342,27 @@ class <%= @classname %>
result.row = m_input_row; result.row = m_input_row;
result.col = m_input_col; result.col = m_input_col;
result.token = _TOKEN_COUNT; result.token = _TOKEN_COUNT;
MatchInfo match_info; auto match_result = find_longest_match();
find_longest_match(&result, &match_info); if (match_result.is_eof())
if (result.token != _TOKEN_COUNT)
{ {
result.type = Result.Type.TOKEN;
result.token = TOKEN___EOF;
return result; return result;
} }
if (match_info.accepting_state != null) else if (match_result.is_decode_error())
{ {
uint token_to_accept = match_info.accepting_state.token; result.type = Result.Type.DECODE_ERROR;
if (match_info.accepting_state.code_id.is_valid()) return result;
}
else if (match_result.is_unexpected_input())
{ {
Token user_code_token = user_code(match_info.accepting_state.code_id, m_input[m_input_position..(m_input_position + match_info.length)], &result); result.type = Result.Type.UNEXPECTED_INPUT;
return result;
}
uint token_to_accept = match_result.accepting_state.token;
if (match_result.accepting_state.code_id.is_valid())
{
Token user_code_token = user_code(match_result.accepting_state.code_id, m_input[m_input_position..(m_input_position + match_result.length)], &result);
/* An invalid Token from user_code() means that the user /* An invalid Token from user_code() means that the user
* code did not explicitly return a token. So only override * code did not explicitly return a token. So only override
* the token to return if the user code does explicitly * the token to return if the user code does explicitly
@ -372,19 +374,19 @@ class <%= @classname %>
} }
/* Update the input position tracking. */ /* Update the input position tracking. */
m_input_position += match_info.length; m_input_position += match_result.length;
m_input_row += match_info.delta_row; m_input_row += match_result.delta_row;
if (match_info.delta_row != 0u) if (match_result.delta_row != 0u)
{ {
m_input_col = match_info.delta_col; m_input_col = match_result.delta_col;
} }
else else
{ {
m_input_col += match_info.delta_col; m_input_col += match_result.delta_col;
} }
result.token = token_to_accept; result.token = token_to_accept;
result.length = match_info.length; result.length = match_result.length;
if (match_info.accepting_state.drop) if (match_result.accepting_state.drop)
{ {
result.type = Result.Type.DROP; result.type = Result.Type.DROP;
} }
@ -392,56 +394,166 @@ class <%= @classname %>
{ {
result.type = Result.Type.TOKEN; result.type = Result.Type.TOKEN;
} }
}
return result; return result;
} }
private void find_longest_match(Result * result, MatchInfo * match_info) /**
* Result type for find_longest_match().
*
* Alternatives:
* - decode_error
* - eof
* - found_match(length, delta_row, delta_col, accepting_state)
* - unexpected_input(unexpected_input_length)
*/
struct FindLongestMatchResult
{
    /* Tagged union: `type` holds one of the values below and selects
     * which member of the anonymous union is meaningful. */
    enum : ubyte
    {
        FOUND_MATCH,
        DECODE_ERROR,
        EOF,
        UNEXPECTED_INPUT,
    }

    /* Discriminator; one of the anonymous enum values above. */
    ubyte type;
    alias type this;

    union
    {
        /* Payload for FOUND_MATCH. */
        struct
        {
            /* Length of the matched input (sum of the code_point_length
             * of every consumed code point). */
            size_t length;
            /* Number of '\n' code points consumed by the match. */
            size_t delta_row;
            /* Column change: reset to 0 on '\n', otherwise incremented
             * per consumed code point. */
            size_t delta_col;
            /* The accepting state for the match (null while no state
             * has accepted yet). */
            const(State) * accepting_state;
        }
        /* Payload for UNEXPECTED_INPUT. Shares storage with the
         * FOUND_MATCH payload above. */
        size_t unexpected_input_length;
    }

    /* Construct an alternative that carries no payload. */
    this(ubyte type)
    {
        this.type = type;
    }

    /* Construct an alternative carrying an unexpected input length. */
    this(ubyte type, size_t unexpected_input_length)
    {
        this.type = type;
        this.unexpected_input_length = unexpected_input_length;
    }

    /* Construct an alternative carrying the full match payload. */
    this(ubyte type, size_t length, size_t delta_row, size_t delta_col, const(State) * accepting_state)
    {
        this.type = type;
        this.length = length;
        this.delta_row = delta_row;
        this.delta_col = delta_col;
        this.accepting_state = accepting_state;
    }

    /* Factory functions, one per alternative. */

    static FindLongestMatchResult found_match(size_t length, size_t delta_row, size_t delta_col, const(State) * accepting_state)
    {
        return FindLongestMatchResult(FOUND_MATCH, length, delta_row, delta_col, accepting_state);
    }

    static FindLongestMatchResult decode_error()
    {
        return FindLongestMatchResult(DECODE_ERROR);
    }

    static FindLongestMatchResult eof()
    {
        return FindLongestMatchResult(EOF);
    }

    static FindLongestMatchResult unexpected_input(size_t unexpected_input_length)
    {
        return FindLongestMatchResult(UNEXPECTED_INPUT, unexpected_input_length);
    }

    /* Queries for the active alternative. They only read `type`, so they
     * are const and may be called on const/immutable results as well. */

    bool is_found_match() const
    {
        return type == FOUND_MATCH;
    }

    bool is_decode_error() const
    {
        return type == DECODE_ERROR;
    }

    bool is_eof() const
    {
        return type == EOF;
    }

    bool is_unexpected_input() const
    {
        return type == UNEXPECTED_INPUT;
    }
}
/**
 * Find the longest match starting at the current input position.
 *
 * Starts in the state table of the current mode and consumes code points
 * from m_input (beginning at m_input_position) for as long as a state
 * transition exists, remembering the most recent accepting state seen.
 * The caller is responsible for advancing the input position by the
 * returned length.
 *
 * Returns:
 * - decode_error() if a code point could not be decoded.
 * - found_match(...) describing the longest accepted match, if any state
 *   accepted before EOF or a missing transition was reached.
 * - eof() if EOF was reached without consuming any input.
 * - unexpected_input(n) otherwise, where n is the length consumed so far
 *   (plus the length of the offending code point when a transition was
 *   not found).
 */
private FindLongestMatchResult find_longest_match()
{
    FindLongestMatchResult longest_match = FindLongestMatchResult.found_match(0, 0, 0, null);
    FindLongestMatchResult attempt_match = longest_match;
    uint current_state = modes[m_mode].state_table_offset;
    for (;;)
    {
        auto decoded = Decoder.decode_code_point(m_input[(m_input_position + attempt_match.length)..(m_input.length)]);
        if (decoded.is_decode_error())
        {
            return FindLongestMatchResult.decode_error();
        }

        if (decoded.is_eof())
        {
            /* We hit EOF. */
            if (longest_match.length > 0)
            {
                /* We have a match, so use it. */
                return longest_match;
            }
            if (attempt_match.length != 0)
            {
                /* There is a partial match - error! */
                return FindLongestMatchResult.unexpected_input(attempt_match.length);
            }
            /* Valid EOF return. */
            return FindLongestMatchResult.eof();
        }

        auto transition_result = transition(current_state, decoded.code_point);
        if (!transition_result.found())
        {
            /* No transition for this code point: fall back to the
             * longest accepted match, if there is one. */
            if (longest_match.length > 0)
            {
                return longest_match;
            }
            /* Nothing was accepted; report everything consumed so far
             * plus the offending code point as unexpected. */
            return FindLongestMatchResult.unexpected_input(attempt_match.length + decoded.code_point_length);
        }

        /* Consume the code point and update the row/column deltas. */
        attempt_match.length += decoded.code_point_length;
        if (decoded.code_point == '\n')
        {
            attempt_match.delta_row++;
            attempt_match.delta_col = 0u;
        }
        else
        {
            attempt_match.delta_col++;
        }
        current_state = transition_result.destination();
        if (states[current_state].accepts())
        {
            /* Remember this as the longest accepted match so far. */
            attempt_match.accepting_state = &states[current_state];
            longest_match = attempt_match;
        }
    }
}