From 39088f651880f7eebbe37b92f33a322bc4e81626 Mon Sep 17 00:00:00 2001 From: Josh Holtrop Date: Sat, 17 Dec 2016 10:54:50 -0500 Subject: [PATCH] Add output parameter to Encoding::decode() to get encoded size --- src/core/Encoding.cc | 87 +++++++++++++++++++++------------------ src/core/Encoding.h | 2 +- test/src/test_Encoding.cc | 23 +++++++---- 3 files changed, 64 insertions(+), 48 deletions(-) diff --git a/src/core/Encoding.cc b/src/core/Encoding.cc index 959b233..10bdd9f 100644 --- a/src/core/Encoding.cc +++ b/src/core/Encoding.cc @@ -97,64 +97,71 @@ const uint8_t * Encoding::beginning_of_code_point(Type type, const uint8_t * enc return nullptr; } -uint32_t Encoding::decode(Type type, const uint8_t * encoded) +uint32_t Encoding::decode(Type type, const uint8_t * encoded, uint8_t * encoded_size) { + uint32_t result = 0u; + uint8_t size = 0u; switch (type) { case UTF_8: { - const uint8_t c = *encoded; - uint8_t following_bytes = 0u; - uint32_t v; + const uint_fast8_t c = *encoded; if ((c & 0x80u) == 0u) { - return c; - } - else if ((c & 0xE0u) == 0xC0u) - { - v = c & 0x1Fu; - following_bytes = 1u; - } - else if ((c & 0xF0u) == 0xE0u) - { - v = c & 0x0Fu; - following_bytes = 2u; - } - else if ((c & 0xF8u) == 0xF0u) - { - v = c & 0x07u; - following_bytes = 3u; - } - else if ((c & 0xFCu) == 0xF8u) - { - v = c & 0x03u; - following_bytes = 4u; - } - else if ((c & 0xFEu) == 0xFCu) - { - v = c & 0x01u; - following_bytes = 5u; + result = c; + size = 1u; } else { - return 0u; + uint_fast8_t following_bytes = 0u; + if ((c & 0xE0u) == 0xC0u) + { + result = c & 0x1Fu; + following_bytes = 1u; + } + else if ((c & 0xF0u) == 0xE0u) + { + result = c & 0x0Fu; + following_bytes = 2u; + } + else if ((c & 0xF8u) == 0xF0u) + { + result = c & 0x07u; + following_bytes = 3u; + } + else if ((c & 0xFCu) == 0xF8u) + { + result = c & 0x03u; + following_bytes = 4u; + } + else if ((c & 0xFEu) == 0xFCu) + { + result = c & 0x01u; + following_bytes = 5u; + } + size = following_bytes + 1u; + while (following_bytes-- > 0u) + { + encoded++; + result <<= 6u; + result |= *encoded & 0x3Fu; + } } - while (following_bytes-- > 0u) - { - encoded++; - v <<= 6u; - v |= *encoded & 0x3Fu; - } - return v; } break; case CP_1252: /* TODO: map byte to code point */ - return *encoded; + result = *encoded; + size = 1u; break; } - return 0u; + if (encoded_size != nullptr) + { + *encoded_size = size; + } + + return result; } uint8_t Encoding::num_bytes_to_encode_code_point(uint32_t code_point, Type type) diff --git a/src/core/Encoding.h b/src/core/Encoding.h index 617f302..9b2c901 100644 --- a/src/core/Encoding.h +++ b/src/core/Encoding.h @@ -21,7 +21,7 @@ public: static Type detect_encoding(const uint8_t * buffer, size_t length); static uint8_t num_bytes_in_code_point(Type type, const uint8_t * encoded); static const uint8_t * beginning_of_code_point(Type type, const uint8_t * encoded); - static uint32_t decode(Type type, const uint8_t * encoded); + static uint32_t decode(Type type, const uint8_t * encoded, uint8_t * encoded_size = nullptr); static uint8_t num_bytes_to_encode_code_point(uint32_t code_point, Type type); static uint8_t encode(uint32_t code_point, Type type, uint8_t * buffer); }; diff --git a/test/src/test_Encoding.cc b/test/src/test_Encoding.cc index d6deb06..d37ec3d 100644 --- a/test/src/test_Encoding.cc +++ b/test/src/test_Encoding.cc @@ -55,17 +55,26 @@ TEST(Encoding_beginning_of_code_point, returns_pointer_to_beginning_of_code_poin TEST(Encoding_decode, decodes_UTF_8_correctly) { - EXPECT_EQ((uint32_t)'%', Encoding::decode(Encoding::UTF_8, (const uint8_t *)"%")); - EXPECT_EQ(0x42u, Encoding::decode(Encoding::UTF_8, (const uint8_t *)"\xC1\x82")); - EXPECT_EQ(0x1083u, Encoding::decode(Encoding::UTF_8, (const uint8_t *)"\xE1\x82\x83")); - EXPECT_EQ(0x420C4u, Encoding::decode(Encoding::UTF_8, (const uint8_t *)"\xF1\x82\x83\x84")); - EXPECT_EQ(0x1083105u, Encoding::decode(Encoding::UTF_8, (const uint8_t *)"\xF9\x82\x83\x84\x85")); - EXPECT_EQ(0x420C4146u, Encoding::decode(Encoding::UTF_8, (const uint8_t *)"\xFD\x82\x83\x84\x85\x86")); + uint8_t size = 0xFFu; + EXPECT_EQ((uint32_t)'%', Encoding::decode(Encoding::UTF_8, (const uint8_t *)"%", &size)); + EXPECT_EQ(1u, size); + EXPECT_EQ(0x42u, Encoding::decode(Encoding::UTF_8, (const uint8_t *)"\xC1\x82", &size)); + EXPECT_EQ(2u, size); + EXPECT_EQ(0x1083u, Encoding::decode(Encoding::UTF_8, (const uint8_t *)"\xE1\x82\x83", &size)); + EXPECT_EQ(3u, size); + EXPECT_EQ(0x420C4u, Encoding::decode(Encoding::UTF_8, (const uint8_t *)"\xF1\x82\x83\x84", &size)); + EXPECT_EQ(4u, size); + EXPECT_EQ(0x1083105u, Encoding::decode(Encoding::UTF_8, (const uint8_t *)"\xF9\x82\x83\x84\x85", &size)); + EXPECT_EQ(5u, size); + EXPECT_EQ(0x420C4146u, Encoding::decode(Encoding::UTF_8, (const uint8_t *)"\xFD\x82\x83\x84\x85\x86", &size)); + EXPECT_EQ(6u, size); } TEST(Encoding_decode, decodes_CP_1252_correctly) { - EXPECT_EQ(0x99u, Encoding::decode(Encoding::CP_1252, (const uint8_t *)"\x99")); + uint8_t size; + EXPECT_EQ(0x99u, Encoding::decode(Encoding::CP_1252, (const uint8_t *)"\x99", &size)); + EXPECT_EQ(1u, size); }