propane/spec/propane/lexer/dfa_spec.rb

# Small reference lexer that walks a token DFA by hand so the specs can
# check which tokens the DFA recognizes for a given input string.
class TestLexer
  # token_dfa: object that responds to +start_state+; states are expected
  # to respond to +transitions+ and +accepts+.
  def initialize(token_dfa)
    @token_dfa = token_dfa
  end

  # Lex +input+ completely.
  #
  # Returns an Array of [token name, matched text] pairs in input order.
  # Raises a RuntimeError listing the leftover characters when a point is
  # reached where no token matches.
  def lex(input)
    remaining = input.chars
    tokens = []
    loop do
      match = lex_token(remaining)
      break unless match
      tokens << match
      # Consume exactly the characters covered by the matched text.
      remaining.shift(match[1].size)
    end
    raise "Unmatched input #{remaining.join(" ")}" unless remaining.empty?
    tokens
  end

  # Find the longest accepted prefix of +input_chars+.
  #
  # Returns [token name, matched text], where the name is nil if the
  # accepting entry carries no token, or nil when nothing is accepted.
  def lex_token(input_chars)
    return nil if input_chars.empty?
    state = @token_dfa.start_state
    best_accepts = nil
    best_length = 0
    input_chars.each_with_index do |char, position|
      state = transition(state, char)
      break unless state
      next unless state.accepts
      # Remember the most recent accepting point; scanning continues so a
      # longer match can still replace it.
      best_accepts = state.accepts
      best_length = position + 1
    end
    return nil unless best_accepts
    token = best_accepts.token
    [token ? token.name : nil, input_chars.first(best_length).join]
  end

  # Destination of the first transition out of +state+ whose code point
  # range includes +input_char+, or nil when there is none.
  def transition(state, input_char)
    edge = state.transitions.find do |candidate|
      candidate.code_point_range.include?(input_char.ord)
    end
    edge ? edge.destination : nil
  end
end

# Build a lexer DFA from the patterns of the given grammar source and lex
# +input+ with a TestLexer, returning the lexer's result.
def run(grammar, input)
  patterns = Propane::Grammar.new(grammar).patterns
  dfa = Propane::Lexer::DFA.new(patterns)
  TestLexer.new(dfa).lex(input)
end

# Each example feeds a small grammar and an input string through `run` and
# compares the resulting [token name, matched text] pairs.
describe Propane::Lexer::DFA do
  it "lexes a simple token" do
    grammar = <<~GRAMMAR
      token foo;
    GRAMMAR
    expect(run(grammar, "foo")).to eq [["foo", "foo"]]
  end

  it "lexes two tokens" do
    grammar = <<~GRAMMAR
      token foo;
      token bar;
    GRAMMAR
    expect(run(grammar, "foobar")).to eq [
      ["foo", "foo"],
      ["bar", "bar"],
    ]
  end

  it "lexes the longer of multiple options" do
    # A pattern covering the whole input is chosen over the shorter
    # literal tokens.
    identifier_grammar = <<~GRAMMAR
      token foo;
      token bar;
      token identifier /[a-z]+/;
    GRAMMAR
    expect(run(identifier_grammar, "foobar")).to eq [
      ["identifier", "foobar"],
    ]

    # Three pluses lex as the two-character token followed by the
    # one-character token.
    plus_grammar = <<~GRAMMAR
      token plus /\\+/;
      token plusplus /\\+\\+/;
    GRAMMAR
    expect(run(plus_grammar, "+++")).to eq [
      ["plusplus", "++"],
      ["plus", "+"],
    ]
  end

  it "lexes whitespace" do
    grammar = <<~GRAMMAR
      token foo;
      token bar;
      token WS /\\s+/;
    GRAMMAR
    expect(run(grammar, "foo \tbar")).to eq [
      ["foo", "foo"],
      ["WS", " \t"],
      ["bar", "bar"],
    ]
  end

  it "allows dropping a matched pattern" do
    grammar = <<~GRAMMAR
      token foo;
      token bar;
      drop /\\s+/;
    GRAMMAR
    # The dropped pattern still matches text but is reported with a nil name.
    expect(run(grammar, "foo \tbar")).to eq [
      ["foo", "foo"],
      [nil, " \t"],
      ["bar", "bar"],
    ]
  end

  it "matches a semicolon" do
    grammar = <<~GRAMMAR
      token semicolon /;/;
    GRAMMAR
    expect(run(grammar, ";")).to eq [
      ["semicolon", ";"],
    ]
  end
end