245 lines
5.5 KiB
C++
245 lines
5.5 KiB
C++
#include "Encoding.h"
|
|
|
|
/**
 * Verify that the next bytes of a buffer are UTF-8 continuation bytes
 * (bit pattern 10xxxxxx).
 *
 * The pointer referenced by @p buffer is advanced past every continuation
 * byte that was accepted. On failure it is left pointing at the first byte
 * that did not match.
 *
 * No bounds checking is done here; the caller must make sure that
 * @p continuation_bytes_to_check bytes are readable.
 *
 * @param buffer                      in/out pointer to the current read position
 * @param continuation_bytes_to_check number of bytes expected to be continuation bytes
 * @return true if all requested bytes are continuation bytes, false otherwise
 */
static inline bool
check_continuation_bytes(const uint8_t ** buffer,
                         uint8_t continuation_bytes_to_check)
{
    /* Work directly on the caller's pointer so every advance is visible to it. */
    const uint8_t *& cursor = *buffer;
    uint8_t remaining = continuation_bytes_to_check;

    while (remaining-- > 0u)
    {
        const bool is_continuation = ((*cursor & 0xC0u) == 0x80u);
        if (!is_continuation)
        {
            return false;
        }
        ++cursor;
    }

    return true;
}
|
|
|
|
/**
 * Guess whether a byte buffer holds UTF-8 or CP-1252 text.
 *
 * The buffer is scanned from start to end. Each byte with the high bit set
 * must be a lead byte announcing 1 to 5 continuation bytes, and those
 * continuation bytes must all be present inside the buffer and carry the
 * 10xxxxxx pattern. The first violation makes the function report CP_1252.
 * Only the structure of the sequences is checked; the decoded values are
 * not inspected.
 *
 * @param buffer start of the bytes to inspect
 * @param length number of bytes available at @p buffer
 * @return UTF_8 if the whole buffer is structurally valid (this includes an
 *         empty or pure-ASCII buffer), CP_1252 otherwise
 */
Encoding::Type Encoding::detect_encoding(const uint8_t * buffer, size_t length)
{
    size_t pos = 0u;

    while (pos < length)
    {
        const uint8_t lead = *buffer++;
        uint8_t trailing;

        if ((lead & 0x80u) == 0u)
        {
            /* Plain ASCII byte, nothing more to verify. */
            pos++;
            continue;
        }

        if ((lead & 0xE0u) == 0xC0u)
        {
            trailing = 1u;
        }
        else if ((lead & 0xF0u) == 0xE0u)
        {
            trailing = 2u;
        }
        else if ((lead & 0xF8u) == 0xF0u)
        {
            trailing = 3u;
        }
        else if ((lead & 0xFCu) == 0xF8u)
        {
            trailing = 4u;
        }
        else if ((lead & 0xFEu) == 0xFCu)
        {
            trailing = 5u;
        }
        else
        {
            /* Stray continuation byte or 0xFE/0xFF: not a valid lead byte. */
            return CP_1252;
        }

        /* The last continuation byte must still lie inside the buffer. */
        if ((pos + trailing) >= length)
        {
            return CP_1252;
        }

        if (!check_continuation_bytes(&buffer, trailing))
        {
            return CP_1252;
        }

        /* Skip the lead byte together with its continuation bytes. */
        pos += trailing + 1u;
    }

    return UTF_8;
}
|
|
|
|
/**
 * Count the bytes occupied by the code point starting at @p encoded.
 *
 * For UTF_8 a byte without the high bit set counts as one byte. Otherwise
 * the first byte is counted and then every directly following continuation
 * byte (10xxxxxx) is added; the count is obtained by scanning, not from the
 * length announced by the lead byte. The scan stops at the first byte that
 * is not a continuation byte, so the data must contain such a byte.
 *
 * For CP_1252 every code point is exactly one byte.
 *
 * @param type    encoding of the data
 * @param encoded pointer to the first byte of the code point
 * @return number of bytes, or 0 for an unknown @p type
 */
uint8_t Encoding::num_bytes_in_code_point(Type type, const uint8_t * encoded)
{
    if (type == CP_1252)
    {
        return 1u;
    }

    if (type != UTF_8)
    {
        return 0u;
    }

    if ((encoded[0] & 0x80u) == 0u)
    {
        return 1u;
    }

    /* One for the first byte, plus one per continuation byte behind it. */
    uint8_t count = 1u;
    for (const uint8_t * next = encoded + 1; (*next & 0xC0u) == 0x80u; next++)
    {
        count++;
    }

    return count;
}
|
|
|
|
/**
 * Find the first byte of the code point that contains @p encoded.
 *
 * For UTF_8 the pointer is moved backwards for as long as it points at a
 * continuation byte (10xxxxxx). There is no lower bound on this search, so
 * the data in front of @p encoded must contain a non-continuation byte.
 *
 * For CP_1252 every byte is its own code point, so @p encoded is returned
 * unchanged.
 *
 * @param type    encoding of the data
 * @param encoded pointer to any byte of the code point
 * @return pointer to the first byte of the code point, or nullptr for an
 *         unknown @p type
 */
const uint8_t * Encoding::beginning_of_code_point(Type type, const uint8_t * encoded)
{
    if (type == UTF_8)
    {
        const uint8_t * cursor = encoded;
        for (; (*cursor & 0xC0u) == 0x80u; --cursor)
        {
            /* step back over continuation bytes */
        }
        return cursor;
    }

    if (type == CP_1252)
    {
        return encoded;
    }

    return nullptr;
}
|
|
|
|
/**
 * Decode the code point that starts at @p encoded.
 *
 * UTF_8: the lead byte determines how many bytes follow (0 to 5). The low
 * six bits of each following byte are appended to the payload bits of the
 * lead byte. The following bytes are not checked for the 10xxxxxx pattern.
 * A first byte that is neither ASCII nor a recognised lead byte yields
 * code point 0 with a size of 1.
 *
 * CP_1252: the byte value itself is returned with a size of 1.
 *
 * @param type         encoding of the data
 * @param encoded      pointer to the first byte of the code point
 * @param encoded_size optional output (may be nullptr) receiving the number
 *                     of bytes consumed; 0 for an unknown @p type
 * @return the decoded code point; 0 for an unknown @p type
 */
uint32_t Encoding::decode(Type type, const uint8_t * encoded, uint8_t * encoded_size)
{
    uint32_t code_point = 0u;
    uint8_t consumed = 0u;

    if (type == UTF_8)
    {
        const uint_fast8_t lead = encoded[0];
        uint_fast8_t trailing = 0u;

        if ((lead & 0x80u) == 0u)
        {
            code_point = lead;
        }
        else if ((lead & 0xE0u) == 0xC0u)
        {
            code_point = lead & 0x1Fu;
            trailing = 1u;
        }
        else if ((lead & 0xF0u) == 0xE0u)
        {
            code_point = lead & 0x0Fu;
            trailing = 2u;
        }
        else if ((lead & 0xF8u) == 0xF0u)
        {
            code_point = lead & 0x07u;
            trailing = 3u;
        }
        else if ((lead & 0xFCu) == 0xF8u)
        {
            code_point = lead & 0x03u;
            trailing = 4u;
        }
        else if ((lead & 0xFEu) == 0xFCu)
        {
            code_point = lead & 0x01u;
            trailing = 5u;
        }

        consumed = trailing + 1u;

        /* Each following byte contributes its six low bits. */
        for (uint_fast8_t k = 1u; k <= trailing; k++)
        {
            code_point = (code_point << 6u) | (encoded[k] & 0x3Fu);
        }
    }
    else if (type == CP_1252)
    {
        /* TODO: map byte to code point */
        code_point = encoded[0];
        consumed = 1u;
    }

    if (encoded_size != nullptr)
    {
        *encoded_size = consumed;
    }

    return code_point;
}
|
|
|
|
uint8_t Encoding::num_bytes_to_encode_code_point(uint32_t code_point, Type type)
|
|
{
|
|
switch (type)
|
|
{
|
|
case UTF_8:
|
|
if (code_point <= 0x7Fu)
|
|
{
|
|
return 1u;
|
|
}
|
|
else if (code_point <= 0x7FFu)
|
|
{
|
|
return 2u;
|
|
}
|
|
else if (code_point <= 0xFFFFu)
|
|
{
|
|
return 3u;
|
|
}
|
|
else if (code_point <= 0x1FFFFFu)
|
|
{
|
|
return 4u;
|
|
}
|
|
else if (code_point <= 0x3FFFFFFu)
|
|
{
|
|
return 5u;
|
|
}
|
|
else
|
|
{
|
|
return 6u;
|
|
}
|
|
break;
|
|
case CP_1252:
|
|
if (code_point <= 0xFFu)
|
|
{
|
|
return 1u;
|
|
}
|
|
else
|
|
{
|
|
return 0u;
|
|
}
|
|
break;
|
|
}
|
|
|
|
return 0u;
|
|
}
|
|
|
|
uint8_t Encoding::encode(uint32_t code_point, Type type, uint8_t * buffer)
|
|
{
|
|
switch (type)
|
|
{
|
|
case UTF_8:
|
|
{
|
|
uint8_t size = num_bytes_to_encode_code_point(code_point, type);
|
|
if (size == 1u)
|
|
{
|
|
*buffer = code_point;
|
|
}
|
|
else if (size > 1u)
|
|
{
|
|
*buffer = (0xFFu << (8u - size)) | (code_point >> (6u * (size - 1u)));
|
|
for (uint8_t i = 0u, end = size - 1u; i < end; i++)
|
|
{
|
|
buffer[end - i] = 0x80u | (code_point >> (6u * i) & 0x3Fu);
|
|
}
|
|
}
|
|
return size;
|
|
}
|
|
break;
|
|
case CP_1252:
|
|
if (code_point <= 0xFFu)
|
|
{
|
|
*buffer = code_point;
|
|
return 1u;
|
|
}
|
|
break;
|
|
}
|
|
|
|
return 0u;
|
|
}
|