jes/src/core/Encoding.cc

245 lines
5.5 KiB
C++

#include "Encoding.h"
/**
 * Verify that the next bytes in a buffer are all UTF-8 continuation bytes
 * (bit pattern 10xxxxxx).
 *
 * @param buffer  In/out cursor. It is advanced past every byte that passes
 *                the check; on failure it is left pointing at the offending
 *                byte.
 * @param continuation_bytes_to_check  Number of bytes to inspect. The caller
 *                must make sure that many bytes are readable; no bounds check
 *                is done here.
 * @return true when all inspected bytes are continuation bytes (trivially
 *         true when asked to check zero bytes), false otherwise.
 */
static inline bool
check_continuation_bytes(const uint8_t ** buffer,
                         uint8_t continuation_bytes_to_check)
{
    uint8_t remaining = continuation_bytes_to_check;
    while (remaining != 0u)
    {
        /* The two most significant bits of a continuation byte are "10". */
        const bool is_continuation = ((**buffer >> 6) == 0x02);
        if (!is_continuation)
        {
            return false;
        }
        ++(*buffer);
        --remaining;
    }
    return true;
}
/**
 * Guess whether a byte buffer holds UTF-8 text.
 *
 * The buffer is walked once. Bytes with the high bit clear are accepted as
 * is. Any other byte must be a lead byte announcing 1 to 5 continuation
 * bytes, and exactly that many continuation bytes must follow inside the
 * buffer. Overlong forms and the decoded values themselves are not
 * inspected.
 *
 * @param buffer  First byte to examine.
 * @param length  Number of readable bytes at buffer.
 * @return UTF_8 when the whole buffer is well formed by the rules above
 *         (an empty buffer counts as well formed); CP_1252 as soon as a
 *         stray continuation byte, a 0xFE/0xFF byte, a truncated sequence or
 *         a missing continuation byte is found.
 */
Encoding::Type Encoding::detect_encoding(const uint8_t * buffer, size_t length)
{
    const uint8_t * cursor = buffer;
    size_t index = 0u;

    while (index < length)
    {
        const uint8_t lead = *cursor++;
        uint8_t trailing;

        if (lead < 0x80u)
        {
            /* 0xxxxxxx: single-byte character, nothing more to verify. */
            index++;
            continue;
        }
        else if (lead < 0xC0u)
        {
            /* 10xxxxxx: continuation byte where a lead byte is expected. */
            return CP_1252;
        }
        else if (lead < 0xE0u)
        {
            trailing = 1u; /* 110xxxxx */
        }
        else if (lead < 0xF0u)
        {
            trailing = 2u; /* 1110xxxx */
        }
        else if (lead < 0xF8u)
        {
            trailing = 3u; /* 11110xxx */
        }
        else if (lead < 0xFCu)
        {
            trailing = 4u; /* 111110xx */
        }
        else if (lead < 0xFEu)
        {
            trailing = 5u; /* 1111110x */
        }
        else
        {
            /* 0xFE and 0xFF are not lead bytes. */
            return CP_1252;
        }

        /* The last continuation byte must still lie inside the buffer. */
        if ((index + trailing) >= length)
        {
            return CP_1252;
        }
        if (!check_continuation_bytes(&cursor, trailing))
        {
            return CP_1252;
        }

        /* Skip the lead byte and its continuation bytes. */
        index += trailing + 1u;
    }

    return UTF_8;
}
/**
 * Count the bytes occupied by the encoded code point starting at encoded.
 *
 * For UTF_8 the count is 1 for the first byte plus one for every
 * continuation byte (10xxxxxx) that immediately follows it; the length
 * announced by the lead byte is not consulted. The scan only stops at the
 * first non-continuation byte, which is itself read, so such a byte must be
 * present in readable memory after the sequence.
 *
 * @param type     Encoding of the data.
 * @param encoded  First byte of the code point.
 * @return Number of bytes; always 1 for CP_1252; 0 for an unrecognised type.
 */
uint8_t Encoding::num_bytes_in_code_point(Type type, const uint8_t * encoded)
{
    if (type == CP_1252)
    {
        return 1u;
    }
    if (type != UTF_8)
    {
        return 0u;
    }

    /* High bit clear: single-byte character. */
    if (*encoded < 0x80u)
    {
        return 1u;
    }

    uint8_t count = 1u;
    for (const uint8_t * next = encoded + 1; (*next & 0xC0u) == 0x80u; ++next)
    {
        count++;
    }
    return count;
}
/**
 * Find the first byte of the code point that contains the byte at encoded.
 *
 * For UTF_8 the pointer is moved backwards for as long as it points at a
 * continuation byte (10xxxxxx). No lower bound is checked, so the caller
 * must guarantee that a non-continuation byte exists at or before encoded.
 *
 * @param type     Encoding of the data.
 * @param encoded  Any byte inside an encoded code point.
 * @return Pointer to the first byte of that code point; encoded itself for
 *         CP_1252; nullptr for an unrecognised type.
 */
const uint8_t * Encoding::beginning_of_code_point(Type type, const uint8_t * encoded)
{
    if (type == UTF_8)
    {
        const uint8_t * start = encoded;
        /* Top two bits "10" mark a continuation byte: keep stepping back. */
        for (; (*start >> 6) == 0x02; --start)
        {
        }
        return start;
    }
    if (type == CP_1252)
    {
        return encoded;
    }
    return nullptr;
}
/**
 * Decode one code point from encoded data.
 *
 * UTF_8: the lead byte selects the sequence length (1 to 6 bytes). The low
 * six bits of each following byte are merged in without checking that the
 * byte really is a continuation byte. A first byte that is neither ASCII nor
 * a recognised lead byte (10xxxxxx, 0xFE, 0xFF) yields code point 0 with a
 * size of 1.
 *
 * CP_1252: the byte value is returned unchanged with a size of 1.
 *
 * @param type          Encoding of the data.
 * @param encoded       First byte of the code point to decode.
 * @param encoded_size  Optional out-parameter (may be nullptr) receiving the
 *                      number of bytes consumed; 0 for an unrecognised type.
 * @return The decoded value; 0 for an unrecognised type.
 */
uint32_t Encoding::decode(Type type, const uint8_t * encoded, uint8_t * encoded_size)
{
    uint32_t code_point = 0u;
    uint8_t consumed = 0u;

    if (type == UTF_8)
    {
        const uint_fast8_t lead = *encoded;
        uint_fast8_t trailing = 0u;

        /* Classify the first byte and keep only its payload bits. */
        if (lead < 0x80u)
        {
            code_point = lead; /* 0xxxxxxx */
        }
        else if (lead < 0xC0u)
        {
            /* 10xxxxxx: not a lead byte, result stays 0. */
        }
        else if (lead < 0xE0u)
        {
            code_point = lead & 0x1Fu; /* 110xxxxx */
            trailing = 1u;
        }
        else if (lead < 0xF0u)
        {
            code_point = lead & 0x0Fu; /* 1110xxxx */
            trailing = 2u;
        }
        else if (lead < 0xF8u)
        {
            code_point = lead & 0x07u; /* 11110xxx */
            trailing = 3u;
        }
        else if (lead < 0xFCu)
        {
            code_point = lead & 0x03u; /* 111110xx */
            trailing = 4u;
        }
        else if (lead < 0xFEu)
        {
            code_point = lead & 0x01u; /* 1111110x */
            trailing = 5u;
        }
        /* 0xFE / 0xFF: not a lead byte, result stays 0. */

        consumed = static_cast<uint8_t>(trailing + 1u);

        /* Append six payload bits from each following byte. */
        for (uint_fast8_t k = 1u; k <= trailing; k++)
        {
            code_point = (code_point << 6u) | (encoded[k] & 0x3Fu);
        }
    }
    else if (type == CP_1252)
    {
        /* TODO: map byte to code point */
        code_point = *encoded;
        consumed = 1u;
    }

    if (encoded_size != nullptr)
    {
        *encoded_size = consumed;
    }
    return code_point;
}
/**
 * Report how many bytes are needed to store a code point in an encoding.
 *
 * UTF_8 uses the 1- to 6-byte scheme, whose longest form carries 31 payload
 * bits. Values above 0x7FFFFFFF therefore have no representation and are
 * reported as not encodable instead of being sized as 6 bytes: a 6-byte
 * sequence built from such a value would start with 0xFE or 0xFF, which is
 * not a lead byte.
 *
 * CP_1252 stores values up to 0xFF in a single byte.
 *
 * @param code_point  Value to encode.
 * @param type        Target encoding.
 * @return Number of bytes required, or 0 when the code point cannot be
 *         represented in the encoding or the type is not recognised.
 */
uint8_t Encoding::num_bytes_to_encode_code_point(uint32_t code_point, Type type)
{
    switch (type)
    {
    case UTF_8:
        if (code_point <= 0x7Fu)
        {
            return 1u; /* 7 bits */
        }
        if (code_point <= 0x7FFu)
        {
            return 2u; /* 11 bits */
        }
        if (code_point <= 0xFFFFu)
        {
            return 3u; /* 16 bits */
        }
        if (code_point <= 0x1FFFFFu)
        {
            return 4u; /* 21 bits */
        }
        if (code_point <= 0x3FFFFFFu)
        {
            return 5u; /* 26 bits */
        }
        if (code_point <= 0x7FFFFFFFu)
        {
            return 6u; /* 31 bits */
        }
        /* More than 31 significant bits: not representable. */
        return 0u;
    case CP_1252:
        return (code_point <= 0xFFu) ? 1u : 0u;
    }
    return 0u;
}
/**
 * Write the encoded form of a code point into buffer.
 *
 * UTF_8: the length comes from num_bytes_to_encode_code_point(). A
 * multi-byte sequence consists of a lead byte whose top `length` bits are
 * set, followed by continuation bytes (10xxxxxx) carrying six payload bits
 * each, most significant group first.
 *
 * CP_1252: values up to 0xFF are stored as a single byte; anything larger
 * is rejected and nothing is written.
 *
 * @param code_point  Value to encode.
 * @param type        Target encoding.
 * @param buffer      Destination; must have room for the bytes written.
 * @return Number of bytes written, or 0 when nothing was written
 *         (unencodable value or unrecognised type).
 */
uint8_t Encoding::encode(uint32_t code_point, Type type, uint8_t * buffer)
{
    if (type == CP_1252)
    {
        if (code_point > 0xFFu)
        {
            return 0u;
        }
        buffer[0] = static_cast<uint8_t>(code_point);
        return 1u;
    }
    if (type != UTF_8)
    {
        return 0u;
    }

    const uint8_t length = num_bytes_to_encode_code_point(code_point, type);
    if (length == 0u)
    {
        return length;
    }
    if (length == 1u)
    {
        buffer[0] = static_cast<uint8_t>(code_point);
        return length;
    }

    /* Fill the continuation bytes from the end, six bits at a time. */
    uint32_t remaining = code_point;
    for (uint8_t pos = static_cast<uint8_t>(length - 1u); pos > 0u; pos--)
    {
        buffer[pos] = static_cast<uint8_t>(0x80u | (remaining & 0x3Fu));
        remaining >>= 6u;
    }

    /* Lead byte: `length` high bits set, then the leftover payload bits. */
    buffer[0] = static_cast<uint8_t>((0xFFu << (8u - length)) | remaining);
    return length;
}