Add output parameter to Encoding::decode() to get encoded size

This commit is contained in:
Josh Holtrop 2016-12-17 10:54:50 -05:00
parent ee37616f82
commit 39088f6518
3 changed files with 64 additions and 48 deletions

View File

@ -97,64 +97,71 @@ const uint8_t * Encoding::beginning_of_code_point(Type type, const uint8_t * enc
return nullptr; return nullptr;
} }
uint32_t Encoding::decode(Type type, const uint8_t * encoded) uint32_t Encoding::decode(Type type, const uint8_t * encoded, uint8_t * encoded_size)
{ {
uint32_t result = 0u;
uint8_t size = 0u;
switch (type) switch (type)
{ {
case UTF_8: case UTF_8:
{ {
const uint8_t c = *encoded; const uint_fast8_t c = *encoded;
uint8_t following_bytes = 0u;
uint32_t v;
if ((c & 0x80u) == 0u) if ((c & 0x80u) == 0u)
{ {
return c; result = c;
} size = 1u;
else if ((c & 0xE0u) == 0xC0u)
{
v = c & 0x1Fu;
following_bytes = 1u;
}
else if ((c & 0xF0u) == 0xE0u)
{
v = c & 0x0Fu;
following_bytes = 2u;
}
else if ((c & 0xF8u) == 0xF0u)
{
v = c & 0x07u;
following_bytes = 3u;
}
else if ((c & 0xFCu) == 0xF8u)
{
v = c & 0x03u;
following_bytes = 4u;
}
else if ((c & 0xFEu) == 0xFCu)
{
v = c & 0x01u;
following_bytes = 5u;
} }
else else
{ {
return 0u; uint_fast8_t following_bytes = 0u;
if ((c & 0xE0u) == 0xC0u)
{
result = c & 0x1Fu;
following_bytes = 1u;
}
else if ((c & 0xF0u) == 0xE0u)
{
result = c & 0x0Fu;
following_bytes = 2u;
}
else if ((c & 0xF8u) == 0xF0u)
{
result = c & 0x07u;
following_bytes = 3u;
}
else if ((c & 0xFCu) == 0xF8u)
{
result = c & 0x03u;
following_bytes = 4u;
}
else if ((c & 0xFEu) == 0xFCu)
{
result = c & 0x01u;
following_bytes = 5u;
}
size = following_bytes + 1u;
while (following_bytes-- > 0u)
{
encoded++;
result <<= 6u;
result |= *encoded & 0x3Fu;
}
} }
while (following_bytes-- > 0u)
{
encoded++;
v <<= 6u;
v |= *encoded & 0x3Fu;
}
return v;
} }
break; break;
case CP_1252: case CP_1252:
/* TODO: map byte to code point */ /* TODO: map byte to code point */
return *encoded; result = *encoded;
size = 1u;
break; break;
} }
return 0u; if (encoded_size != nullptr)
{
*encoded_size = size;
}
return result;
} }
uint8_t Encoding::num_bytes_to_encode_code_point(uint32_t code_point, Type type) uint8_t Encoding::num_bytes_to_encode_code_point(uint32_t code_point, Type type)

View File

@ -21,7 +21,7 @@ public:
static Type detect_encoding(const uint8_t * buffer, size_t length); static Type detect_encoding(const uint8_t * buffer, size_t length);
static uint8_t num_bytes_in_code_point(Type type, const uint8_t * encoded); static uint8_t num_bytes_in_code_point(Type type, const uint8_t * encoded);
static const uint8_t * beginning_of_code_point(Type type, const uint8_t * encoded); static const uint8_t * beginning_of_code_point(Type type, const uint8_t * encoded);
static uint32_t decode(Type type, const uint8_t * encoded); static uint32_t decode(Type type, const uint8_t * encoded, uint8_t * encoded_size = nullptr);
static uint8_t num_bytes_to_encode_code_point(uint32_t code_point, Type type); static uint8_t num_bytes_to_encode_code_point(uint32_t code_point, Type type);
static uint8_t encode(uint32_t code_point, Type type, uint8_t * buffer); static uint8_t encode(uint32_t code_point, Type type, uint8_t * buffer);
}; };

View File

@ -55,17 +55,26 @@ TEST(Encoding_beginning_of_code_point, returns_pointer_to_beginning_of_code_poin
/* Verifies that Encoding::decode() returns the expected code point and
 * reports the expected encoded size for UTF-8 sequences of 1 to 6 bytes. */
TEST(Encoding_decode, decodes_UTF_8_correctly)
{
    /* String literals are char arrays; decode() takes unsigned bytes. */
    const auto bytes = [](const char * literal) {
        return reinterpret_cast<const uint8_t *>(literal);
    };

    /* Seeded with a value no case expects, so a missing write is detected. */
    uint8_t size = 0xFFu;

    EXPECT_EQ(static_cast<uint32_t>('%'), Encoding::decode(Encoding::UTF_8, bytes("%"), &size));
    EXPECT_EQ(1u, size);

    EXPECT_EQ(0x42u, Encoding::decode(Encoding::UTF_8, bytes("\xC1\x82"), &size));
    EXPECT_EQ(2u, size);

    EXPECT_EQ(0x1083u, Encoding::decode(Encoding::UTF_8, bytes("\xE1\x82\x83"), &size));
    EXPECT_EQ(3u, size);

    EXPECT_EQ(0x420C4u, Encoding::decode(Encoding::UTF_8, bytes("\xF1\x82\x83\x84"), &size));
    EXPECT_EQ(4u, size);

    EXPECT_EQ(0x1083105u, Encoding::decode(Encoding::UTF_8, bytes("\xF9\x82\x83\x84\x85"), &size));
    EXPECT_EQ(5u, size);

    EXPECT_EQ(0x420C4146u, Encoding::decode(Encoding::UTF_8, bytes("\xFD\x82\x83\x84\x85\x86"), &size));
    EXPECT_EQ(6u, size);
}
/* Verifies that Encoding::decode() passes a CP-1252 byte through unchanged
 * and reports an encoded size of one byte. */
TEST(Encoding_decode, decodes_CP_1252_correctly)
{
    /* Seeded with a sentinel so the size check below never reads an
     * indeterminate value and only passes if decode() actually wrote it. */
    uint8_t size = 0xFFu;
    EXPECT_EQ(0x99u, Encoding::decode(Encoding::CP_1252, (const uint8_t *)"\x99", &size));
    EXPECT_EQ(1u, size);
}