Add lexer modes and $mode() code expansion

This commit is contained in:
Josh Holtrop 2022-10-09 22:49:01 -04:00
parent b2d11321fe
commit 02be6de48e
6 changed files with 145 additions and 38 deletions

View File

@ -124,14 +124,19 @@ class <%= @classname %>
uint code_id;
}
<% transition_table, state_table = @lexer.build_tables %>
/* Per-mode lexer data; one entry is generated for each lexer mode. */
private struct Mode
{
/* Index into the states table of the state lexing starts from in this mode. */
uint state_table_offset;
}
<% transition_table, state_table, mode_table = @lexer.build_tables %>
private static immutable Transition transitions[] = [
<% transition_table.each do |transition_table_entry| %>
Transition(<%= transition_table_entry[:first] %>u, <%= transition_table_entry[:last] %>u, <%= transition_table_entry[:destination] %>u),
<% end %>
];
private static const State states[] = [
private static immutable State states[] = [
<% state_table.each do |state_table_entry| %>
State(<%= state_table_entry[:transition_table_index] %>u,
<%= state_table_entry[:n_transitions] %>u,
@ -140,6 +145,12 @@ class <%= @classname %>
<% end %>
];
/* One entry per lexer mode, in mode ID order; indexed by m_mode. */
private static immutable Mode modes[] = [
<% mode_table.each do |mode_table_entry| %>
Mode(<%= mode_table_entry[:state_table_offset] %>u),
<% end %>
];
struct LexedToken
{
size_t row;
@ -153,11 +164,13 @@ class <%= @classname %>
private size_t m_input_position;
private size_t m_input_row;
private size_t m_input_col;
private size_t m_mode;
/* Store the input buffer and its length, and start lexing in the mode
 * named "default" (its numeric ID is substituted when this template is
 * rendered). */
this(const(ubyte) * input, size_t input_length)
{
m_input = input;
m_input_length = input_length;
m_mode = <%= @lexer.mode_id("default") %>;
}
LexedToken lex_token()
@ -211,7 +224,7 @@ class <%= @classname %>
MatchInfo longest_match_info;
longest_match_info.token = _TOKEN_COUNT;
MatchInfo attempt_match_info;
uint current_state;
uint current_state = modes[m_mode].state_table_offset;
for (;;)
{
auto decoded = Decoder.decode_code_point(&m_input[m_input_position + attempt_match_info.length], m_input_length - m_input_position - attempt_match_info.length);

View File

@ -26,6 +26,17 @@ class Propane
private
def process_grammar!
# Assign default pattern mode to patterns without a mode assigned.
found_default = false
@grammar.patterns.each do |pattern|
if pattern.mode.nil?
pattern.mode = "default"
found_default = true
end
end
unless found_default
raise Error.new("No patterns found for default mode")
end
# Add EOF token.
@grammar.tokens << Token.new("$EOF", nil)
tokens_by_name = {}
@ -152,6 +163,13 @@ class Propane
def expand_code(code)
  # First pass: replace each $token(name) with the generated token constant.
  with_tokens = code.gsub(/\$token\(([$\w]+)\)/) do
    "TOKEN_#{Token.code_name(Regexp.last_match(1))}"
  end
  # Second pass: replace each $mode(name) with an assignment that switches
  # the lexer to that mode; an unknown mode name is an error.
  with_tokens.gsub(/\$mode\(([a-zA-Z_][a-zA-Z_0-9]*)\)/) do
    requested_mode = Regexp.last_match(1)
    resolved_id = @lexer.mode_id(requested_mode)
    raise Error, "Lexer mode '#{requested_mode}' not found" unless resolved_id

    "m_mode = #{resolved_id}u"
  end
end

View File

@ -1,51 +1,74 @@
class Propane
class Lexer
# @return [DFA]
# Lexer DFA.
attr_accessor :dfa
def initialize(grammar)
@grammar = grammar
@dfa = DFA.new(grammar.patterns)
end
def build_tables
@modes = @grammar.patterns.group_by do |pattern|
pattern.mode
end.transform_values do |patterns|
{dfa: DFA.new(patterns)}
end
@modes.each_with_index do |(mode_name, mode_info), index|
mode_info[:id] = index
end
transition_table = []
state_table = []
states = @dfa.enumerate
states.each do |state, id|
token =
if state.accepts.nil?
@grammar.tokens.size
elsif state.accepts.drop?
TOKEN_DROP
elsif state.accepts.token
state.accepts.token.id
else
@grammar.tokens.size
end
code_id =
if state.accepts && state.accepts.code_id
state.accepts.code_id
else
0xFFFF_FFFF
end
state_table << {
transition_table_index: transition_table.size,
n_transitions: state.transitions.size,
token: token,
code_id: code_id,
mode_table = []
@modes.each do |mode_name, mode_info|
state_table_offset = state_table.size
mode_table << {
state_table_offset: state_table_offset,
}
state.transitions.each do |transition|
transition_table << {
first: transition.code_point_range.first,
last: transition.code_point_range.last,
destination: states[transition.destination],
states = mode_info[:dfa].enumerate
states.each do |state, id|
token =
if state.accepts.nil?
@grammar.tokens.size
elsif state.accepts.drop?
TOKEN_DROP
elsif state.accepts.token
state.accepts.token.id
else
@grammar.tokens.size
end
code_id =
if state.accepts && state.accepts.code_id
state.accepts.code_id
else
0xFFFF_FFFF
end
state_table << {
transition_table_index: transition_table.size,
n_transitions: state.transitions.size,
token: token,
code_id: code_id,
}
state.transitions.each do |transition|
transition_table << {
first: transition.code_point_range.first,
last: transition.code_point_range.last,
destination: states[transition.destination] + state_table_offset,
}
end
end
end
[transition_table, state_table]
[transition_table, state_table, mode_table]
end
# Look up the numeric ID of a lexer mode by name.
#
# @param mode_name [String]
#   Mode name.
#
# @return [Integer, nil]
#   Mode ID, or nil when no mode with that name is known.
def mode_id(mode_name)
  entry = @modes[mode_name]
  return nil unless entry

  entry[:id]
end
end

View File

@ -28,7 +28,7 @@ class Propane
# @return [String, nil]
# Lexer mode for this pattern.
attr_reader :mode
attr_accessor :mode
# Construct a Pattern.
#

View File

@ -192,4 +192,37 @@ EOF
"def!",
])
end
# End-to-end check of lexer modes: a double quote in the default mode
# switches to the "string" mode via $mode(string); inside that mode the
# closing quote switches back with $mode(default) and returns the string
# token.
it "supports lexer modes" do
write_grammar <<EOF
token abc;
token def;
tokenid string;
drop /\\s+/;
/"/ <<
writeln("begin string mode");
$mode(string);
>>
string: /[^"]+/ <<
writeln("captured string");
>>
string: /"/ <<
$mode(default);
return $token(string);
>>
Start -> abc string def;
EOF
build_parser
compile("spec/test_lexer_modes.d")
results = run
expect(results.status).to eq 0
# For each of the two inputs in spec/test_lexer_modes.d, the writeln calls
# from the grammar's pattern code appear first, followed by the "passN"
# line that the D test prints after a successful parse.
verify_lines(results.stdout, [
"begin string mode",
"captured string",
"pass1",
"begin string mode",
"captured string",
"pass2",
])
end
end

20
spec/test_lexer_modes.d Normal file
View File

@ -0,0 +1,20 @@
import testparser;
import std.stdio;
// Program entry point; it does no work itself and reports success.
// NOTE(review): the checks in this file live in the unittest block, so this
// presumably relies on being compiled with unit tests enabled -- confirm
// against the spec's compile helper.
int main()
{
return 0;
}
unittest
{
    // Case 1: a double-quoted section sits between the abc and def tokens.
    string first_input = `abc "a string" def`;
    auto first_parser = new Testparser.Parser(cast(const(ubyte) *)first_input.ptr, first_input.length);
    assert(first_parser.parse() == true);
    writeln("pass1");

    // Case 2: the quoted section itself contains the text "abc def".
    string second_input = `abc "abc def" def`;
    auto second_parser = new Testparser.Parser(cast(const(ubyte) *)second_input.ptr, second_input.length);
    assert(second_parser.parse() == true);
    writeln("pass2");
}