Work on real D strings instead of a ubyte pointer and length

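Also fix two UTF-8 decoder bugs: continuation bytes are now checked against the 10xxxxxx pattern ((b & 0xC0) == 0x80), and only their low six bits (b & 0x3F) are merged into the code point.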
This commit is contained in:
Josh Holtrop 2022-10-15 13:32:01 -04:00
parent de93d23585
commit 623c644e74
11 changed files with 43 additions and 77 deletions

View File

@ -41,13 +41,13 @@ class <%= @classname %>
uint code_point_length;
}
static DecodedCodePoint decode_code_point(const(ubyte) * input, size_t input_length)
static DecodedCodePoint decode_code_point(string input)
{
if (input_length == 0u)
if (input.length == 0u)
{
return DecodedCodePoint(CODE_POINT_EOF, 0u);
}
ubyte c = *input;
char c = input[0];
uint code_point;
uint code_point_length;
if ((c & 0x80u) == 0u)
@ -87,20 +87,19 @@ class <%= @classname %>
{
return DecodedCodePoint(CODE_POINT_INVALID, 0u);
}
if (input_length <= following_bytes)
if (input.length <= following_bytes)
{
return DecodedCodePoint(CODE_POINT_INVALID, 0u);
}
code_point_length = following_bytes + 1u;
while (following_bytes-- > 0u)
for (size_t i = 0u; i < following_bytes; i++)
{
input++;
ubyte b = *input;
if ((b & 0xC0u) != 0u)
char b = input[i + 1u];
if ((b & 0xC0u) != 0x80u)
{
return DecodedCodePoint(CODE_POINT_INVALID, 0u);
}
code_point = (code_point << 6u) | b;
code_point = (code_point << 6u) | (b & 0x3Fu);
}
}
return DecodedCodePoint(code_point, code_point_length);
@ -159,17 +158,15 @@ class <%= @classname %>
uint token;
}
private const(ubyte) * m_input;
private size_t m_input_length;
private string m_input;
private size_t m_input_position;
private size_t m_input_row;
private size_t m_input_col;
private size_t m_mode;
this(const(ubyte) * input, size_t input_length)
this(string input)
{
m_input = input;
m_input_length = input_length;
m_mode = <%= @lexer.mode_id("default") %>;
}
@ -227,7 +224,7 @@ class <%= @classname %>
uint current_state = modes[m_mode].state_table_offset;
for (;;)
{
auto decoded = Decoder.decode_code_point(&m_input[m_input_position + attempt_match_info.length], m_input_length - m_input_position - attempt_match_info.length);
auto decoded = Decoder.decode_code_point(m_input[(m_input_position + attempt_match_info.length)..(m_input.length)]);
if (decoded.code_point == Decoder.CODE_POINT_INVALID)
{
lt.token = _TOKEN_DECODE_ERROR;
@ -377,9 +374,9 @@ class <%= @classname %>
private <%= @grammar.result_type %> parse_result;
this(const(ubyte) * input, size_t input_length)
this(string input)
{
m_lexer = new Lexer(input, input_length);
m_lexer = new Lexer(input);
}
bool parse()

View File

@ -76,6 +76,7 @@ EOF
build_parser
compile("spec/test_d_lexer.d")
results = run
expect(results.stderr).to eq ""
expect(results.status).to eq 0
end

View File

@ -9,59 +9,27 @@ int main()
unittest
{
alias DCP = Testparser.Decoder.DecodedCodePoint;
string inputstring = "5+\n 66";
const(ubyte) * input = cast(const(ubyte) *)inputstring.ptr;
size_t input_length = inputstring.length;
DCP dcp;
dcp = Testparser.Decoder.decode_code_point(input, input_length);
dcp = Testparser.Decoder.decode_code_point("5");
assert(dcp == DCP('5', 1u));
input += dcp.code_point_length;
input_length -= dcp.code_point_length;
dcp = Testparser.Decoder.decode_code_point(input, input_length);
assert(dcp == DCP('+', 1u));
input += dcp.code_point_length;
input_length -= dcp.code_point_length;
dcp = Testparser.Decoder.decode_code_point(input, input_length);
assert(dcp == DCP('\n', 1u));
input += dcp.code_point_length;
input_length -= dcp.code_point_length;
dcp = Testparser.Decoder.decode_code_point(input, input_length);
assert(dcp == DCP(' ', 1u));
input += dcp.code_point_length;
input_length -= dcp.code_point_length;
dcp = Testparser.Decoder.decode_code_point(input, input_length);
assert(dcp == DCP('6', 1u));
input += dcp.code_point_length;
input_length -= dcp.code_point_length;
dcp = Testparser.Decoder.decode_code_point(input, input_length);
assert(dcp == DCP('6', 1u));
input += dcp.code_point_length;
input_length -= dcp.code_point_length;
dcp = Testparser.Decoder.decode_code_point(input, input_length);
dcp = Testparser.Decoder.decode_code_point("");
assert(dcp == DCP(Testparser.Decoder.CODE_POINT_EOF, 0u));
inputstring = "\xf0\x1f\x27\x21";
input = cast(const(ubyte) *)inputstring.ptr;
input_length = inputstring.length;
dcp = Testparser.Decoder.decode_code_point(input, input_length);
dcp = Testparser.Decoder.decode_code_point("\xC2\xA9");
assert(dcp == DCP(0xA9u, 2u));
dcp = Testparser.Decoder.decode_code_point("\xf0\x9f\xa7\xa1");
assert(dcp == DCP(0x1F9E1, 4u));
inputstring = "\xf0\x1f\x27";
input = cast(const(ubyte) *)inputstring.ptr;
input_length = inputstring.length;
dcp = Testparser.Decoder.decode_code_point(input, input_length);
dcp = Testparser.Decoder.decode_code_point("\xf0\x9f\x27");
assert(dcp == DCP(Testparser.Decoder.CODE_POINT_INVALID, 0u));
inputstring = "\xf0\x1f\x27\xFF";
input = cast(const(ubyte) *)inputstring.ptr;
input_length = inputstring.length;
dcp = Testparser.Decoder.decode_code_point(input, input_length);
dcp = Testparser.Decoder.decode_code_point("\xf0\x9f\xa7\xFF");
assert(dcp == DCP(Testparser.Decoder.CODE_POINT_INVALID, 0u));
inputstring = "\xfe";
input = cast(const(ubyte) *)inputstring.ptr;
input_length = inputstring.length;
dcp = Testparser.Decoder.decode_code_point(input, input_length);
dcp = Testparser.Decoder.decode_code_point("\xfe");
assert(dcp == DCP(Testparser.Decoder.CODE_POINT_INVALID, 0u));
}
@ -69,7 +37,7 @@ unittest
{
alias LT = Testparser.Lexer.LexedToken;
string input = "5 + 4 * \n677 + 567";
Testparser.Lexer lexer = new Testparser.Lexer(cast(const(ubyte) *)input.ptr, input.length);
Testparser.Lexer lexer = new Testparser.Lexer(input);
assert(lexer.lex_token() == LT(0, 0, 1, Testparser.TOKEN_int));
assert(lexer.lex_token() == LT(0, 2, 1, Testparser.TOKEN_plus));
assert(lexer.lex_token() == LT(0, 4, 1, Testparser.TOKEN_int));
@ -79,6 +47,6 @@ unittest
assert(lexer.lex_token() == LT(1, 6, 3, Testparser.TOKEN_int));
assert(lexer.lex_token() == LT(1, 9, 0, Testparser.TOKEN_0EOF));
lexer = new Testparser.Lexer(null, 0u);
lexer = new Testparser.Lexer("");
assert(lexer.lex_token() == LT(0, 0, 0, Testparser.TOKEN_0EOF));
}

View File

@ -9,10 +9,10 @@ int main()
unittest
{
string input = "aba";
auto parser = new Testparser.Parser(cast(const(ubyte) *)input.ptr, input.length);
auto parser = new Testparser.Parser(input);
assert(parser.parse() == true);
input = "abb";
parser = new Testparser.Parser(cast(const(ubyte) *)input.ptr, input.length);
parser = new Testparser.Parser(input);
assert(parser.parse() == true);
}

View File

@ -9,14 +9,14 @@ int main()
unittest
{
string input = "a";
auto parser = new Testparser.Parser(cast(const(ubyte) *)input.ptr, input.length);
auto parser = new Testparser.Parser(input);
assert(parser.parse() == false);
input = "a b";
parser = new Testparser.Parser(cast(const(ubyte) *)input.ptr, input.length);
parser = new Testparser.Parser(input);
assert(parser.parse() == true);
input = "bb";
parser = new Testparser.Parser(cast(const(ubyte) *)input.ptr, input.length);
parser = new Testparser.Parser(input);
assert(parser.parse() == true);
}

View File

@ -9,12 +9,12 @@ int main()
unittest
{
string input = `abc "a string" def`;
auto parser = new Testparser.Parser(cast(const(ubyte) *)input.ptr, input.length);
auto parser = new Testparser.Parser(input);
assert(parser.parse() == true);
writeln("pass1");
input = `abc "abc def" def`;
parser = new Testparser.Parser(cast(const(ubyte) *)input.ptr, input.length);
parser = new Testparser.Parser(input);
assert(parser.parse() == true);
writeln("pass2");
}

View File

@ -9,6 +9,6 @@ int main()
unittest
{
string input = "ab";
auto parser = new Testparser.Parser(cast(const(ubyte) *)input.ptr, input.length);
auto parser = new Testparser.Parser(input);
assert(parser.parse() == true);
}

View File

@ -9,17 +9,17 @@ int main()
unittest
{
string input = "a";
auto parser = new Testparser.Parser(cast(const(ubyte) *)input.ptr, input.length);
auto parser = new Testparser.Parser(input);
assert(parser.parse() == true);
assert(parser.result == 1u);
input = "";
parser = new Testparser.Parser(cast(const(ubyte) *)input.ptr, input.length);
parser = new Testparser.Parser(input);
assert(parser.parse() == true);
assert(parser.result == 0u);
input = "aaaaaaaaaaaaaaaa";
parser = new Testparser.Parser(cast(const(ubyte) *)input.ptr, input.length);
parser = new Testparser.Parser(input);
assert(parser.parse() == true);
assert(parser.result == 16u);
}

View File

@ -9,12 +9,12 @@ int main()
unittest
{
string input = "abcdef";
auto parser = new Testparser.Parser(cast(const(ubyte) *)input.ptr, input.length);
auto parser = new Testparser.Parser(input);
assert(parser.parse() == true);
writeln("pass1");
input = "defabcdef";
parser = new Testparser.Parser(cast(const(ubyte) *)input.ptr, input.length);
parser = new Testparser.Parser(input);
assert(parser.parse() == true);
writeln("pass2");
}

View File

@ -9,6 +9,6 @@ int main()
unittest
{
string input = "defghidef";
auto parser = new Testparser.Parser(cast(const(ubyte) *)input.ptr, input.length);
auto parser = new Testparser.Parser(input);
assert(parser.parse() == true);
}

View File

@ -9,12 +9,12 @@ int main()
unittest
{
string input = "abcdef";
auto parser = new Testparser.Parser(cast(const(ubyte) *)input.ptr, input.length);
auto parser = new Testparser.Parser(input);
assert(parser.parse() == true);
writeln("pass1");
input = "abcabcdef";
parser = new Testparser.Parser(cast(const(ubyte) *)input.ptr, input.length);
parser = new Testparser.Parser(input);
assert(parser.parse() == true);
writeln("pass2");
}