#include "Encoding.h"

/**
 * Checks that the next `continuation_bytes_to_check` bytes are UTF-8
 * continuation bytes (bit pattern 10xxxxxx).
 *
 * @param buffer  In/out cursor. It is advanced past every byte that was
 *                accepted; on failure it is left pointing at the offending byte.
 * @param continuation_bytes_to_check  Number of bytes to examine.
 * @return true if all examined bytes are continuation bytes.
 */
static inline bool check_continuation_bytes(const uint8_t ** buffer, uint8_t continuation_bytes_to_check)
{
    for (uint8_t i = 0; i < continuation_bytes_to_check; i++) {
        if ((**buffer & 0xC0u) != 0x80u)
            return false;
        (*buffer)++;
    }
    return true;
}

/**
 * Guesses the encoding of a byte buffer.
 *
 * The buffer is reported as UTF_8 when every non-ASCII lead byte is followed,
 * inside the buffer, by the number of continuation bytes its prefix announces
 * (1 to 5, i.e. the 2- to 6-byte forms). Anything else is reported as CP_1252.
 * Only this structure is checked: overlong encodings are not rejected.
 * An empty or pure ASCII buffer is reported as UTF_8.
 *
 * @param buffer  Bytes to inspect.
 * @param length  Number of bytes available in `buffer`.
 * @return UTF_8 or CP_1252.
 */
Encoding::Type Encoding::detect_encoding(const uint8_t * buffer, size_t length)
{
    for (size_t i = 0u; i < length; i++) {
        const uint8_t c = *buffer;
        uint8_t continuation_bytes_to_check;

        if ((c & 0x80u) == 0u) {
            /* Plain ASCII byte. */
            buffer++;
            continue;
        } else if ((c & 0xE0u) == 0xC0u) {
            continuation_bytes_to_check = 1u;
        } else if ((c & 0xF0u) == 0xE0u) {
            continuation_bytes_to_check = 2u;
        } else if ((c & 0xF8u) == 0xF0u) {
            continuation_bytes_to_check = 3u;
        } else if ((c & 0xFCu) == 0xF8u) {
            continuation_bytes_to_check = 4u;
        } else if ((c & 0xFEu) == 0xFCu) {
            continuation_bytes_to_check = 5u;
        } else {
            /* Stray continuation byte, 0xFE or 0xFF: not a UTF-8 lead byte. */
            return CP_1252;
        }

        buffer++;

        /* The whole sequence has to fit in the buffer before its bytes are read. */
        if (((i + continuation_bytes_to_check) < length)
            && check_continuation_bytes(&buffer, continuation_bytes_to_check)) {
            i += continuation_bytes_to_check;
            continue;
        }

        return CP_1252;
    }

    return UTF_8;
}

/**
 * Returns the number of bytes occupied by the code point starting at `encoded`.
 *
 * For UTF_8 an ASCII byte counts as 1; otherwise the result is 1 plus the
 * number of continuation bytes that directly follow the first byte. The scan
 * only stops at a byte that is not a continuation byte, so the caller has to
 * guarantee that such a byte follows (no length is available here).
 * For CP_1252 the result is always 1.
 *
 * @param type     Encoding of the data.
 * @param encoded  Pointer to the first byte of a code point.
 * @return Size in bytes, or 0 if `type` is not a known encoding.
 */
uint8_t Encoding::num_bytes_in_code_point(Type type, const uint8_t * encoded)
{
    switch (type) {
    case UTF_8: {
        if ((*encoded & 0x80u) == 0u)
            return 1u;

        /* Skip the lead byte, then count the continuation bytes behind it. */
        encoded++;
        uint8_t n = 1u;
        while ((*encoded++ & 0xC0u) == 0x80u)
            n++;
        return n;
    }
    case CP_1252:
        return 1u;
    }

    return 0u;
}

/**
 * Returns a pointer to the first byte of the code point that contains `encoded`.
 *
 * For UTF_8 the pointer is moved backwards while it points at a continuation
 * byte, so the caller has to guarantee that a non-continuation byte exists at
 * or before `encoded`. For CP_1252 every byte is its own code point.
 *
 * @param type     Encoding of the data.
 * @param encoded  Pointer to any byte of a code point.
 * @return Pointer to the first byte, or nullptr if `type` is not a known encoding.
 */
const uint8_t * Encoding::beginning_of_code_point(Type type, const uint8_t * encoded)
{
    switch (type) {
    case UTF_8:
        while ((*encoded & 0xC0u) == 0x80u)
            encoded--;
        return encoded;
    case CP_1252:
        return encoded;
    }

    return nullptr;
}

/**
 * Decodes the code point starting at `encoded`.
 *
 * For UTF_8 the lead byte selects a 1- to 6-byte form and the payload bits of
 * the following bytes are appended without checking that they really are
 * continuation bytes. A byte that is not a valid lead byte (a continuation
 * byte, 0xFE or 0xFF) yields result 0 with a size of 1.
 * For CP_1252 the byte value itself is returned; it is not yet mapped to the
 * corresponding code point.
 *
 * @param type          Encoding of the data.
 * @param encoded       Pointer to the first byte of the code point.
 * @param encoded_size  If not nullptr, receives the number of bytes consumed
 *                      (0 if `type` is not a known encoding).
 * @return The decoded value.
 */
uint32_t Encoding::decode(Type type, const uint8_t * encoded, uint8_t * encoded_size)
{
    uint32_t result = 0u;
    uint8_t size = 0u;

    switch (type) {
    case UTF_8: {
        const uint_fast8_t c = *encoded;
        if ((c & 0x80u) == 0u) {
            result = c;
            size = 1u;
        } else {
            /* The lead byte carries the topmost payload bits and announces the length. */
            uint_fast8_t following_bytes = 0u;
            if ((c & 0xE0u) == 0xC0u) {
                result = c & 0x1Fu;
                following_bytes = 1u;
            } else if ((c & 0xF0u) == 0xE0u) {
                result = c & 0x0Fu;
                following_bytes = 2u;
            } else if ((c & 0xF8u) == 0xF0u) {
                result = c & 0x07u;
                following_bytes = 3u;
            } else if ((c & 0xFCu) == 0xF8u) {
                result = c & 0x03u;
                following_bytes = 4u;
            } else if ((c & 0xFEu) == 0xFCu) {
                result = c & 0x01u;
                following_bytes = 5u;
            }

            size = following_bytes + 1u;

            /* Every following byte contributes its low 6 bits. */
            while (following_bytes-- > 0u) {
                encoded++;
                result <<= 6u;
                result |= *encoded & 0x3Fu;
            }
        }
    } break;
    case CP_1252:
        /* TODO: map byte to code point */
        result = *encoded;
        size = 1u;
        break;
    }

    if (encoded_size != nullptr) {
        *encoded_size = size;
    }

    return result;
}

/**
 * Returns the number of bytes needed to encode `code_point`.
 *
 * @param code_point  Value to encode.
 * @param type        Target encoding.
 * @return 1 to 6 for UTF_8, 1 for CP_1252, or 0 if the value cannot be
 *         encoded in `type` (or `type` is not a known encoding).
 */
uint8_t Encoding::num_bytes_to_encode_code_point(uint32_t code_point, Type type)
{
    switch (type) {
    case UTF_8:
        if (code_point <= 0x7Fu) {
            return 1u;
        } else if (code_point <= 0x7FFu) {
            return 2u;
        } else if (code_point <= 0xFFFFu) {
            return 3u;
        } else if (code_point <= 0x1FFFFFu) {
            return 4u;
        } else if (code_point <= 0x3FFFFFFu) {
            return 5u;
        } else if (code_point <= 0x7FFFFFFFu) {
            return 6u;
        } else {
            /*
             * The 6-byte form holds 31 payload bits (1 in the lead byte plus
             * 5 * 6). Larger values would spill into the lead byte's marker
             * bits, so they are reported as not encodable.
             */
            return 0u;
        }
    case CP_1252:
        if (code_point <= 0xFFu) {
            return 1u;
        } else {
            return 0u;
        }
    }

    return 0u;
}

/**
 * Encodes `code_point` into `buffer`.
 *
 * @param code_point  Value to encode.
 * @param type        Target encoding.
 * @param buffer      Destination; must have room for the encoded bytes
 *                    (see num_bytes_to_encode_code_point()).
 * @return Number of bytes written, or 0 if the value cannot be encoded in
 *         `type`; nothing is written in that case.
 */
uint8_t Encoding::encode(uint32_t code_point, Type type, uint8_t * buffer)
{
    switch (type) {
    case UTF_8: {
        const uint8_t size = num_bytes_to_encode_code_point(code_point, type);
        if (size == 1u) {
            *buffer = static_cast<uint8_t>(code_point);
        } else if (size > 1u) {
            /*
             * Lead byte: `size` leading one bits followed by a zero (the shift
             * overflow is cut off by the conversion to uint8_t), combined with
             * the topmost payload bits of the code point.
             */
            *buffer = static_cast<uint8_t>((0xFFu << (8u - size)) | (code_point >> (6u * (size - 1u))));

            /* Continuation bytes are filled from the last one backwards, 6 bits each. */
            for (uint8_t i = 0u, end = size - 1u; i < end; i++) {
                buffer[end - i] = static_cast<uint8_t>(0x80u | ((code_point >> (6u * i)) & 0x3Fu));
            }
        }
        return size;
    }
    case CP_1252:
        if (code_point <= 0xFFu) {
            *buffer = static_cast<uint8_t>(code_point);
            return 1u;
        }
        break;
    }

    return 0u;
}