From 401eb1d960d895fcfd24dc3096fb68c7b06dfe23 Mon Sep 17 00:00:00 2001 From: Josh Holtrop Date: Fri, 27 Nov 2020 21:37:53 -0500 Subject: [PATCH] Completed migrating Encoding to D. --- src-c/core/Encoding.cc | 244 ----------------------------------------- src-c/core/Encoding.h | 29 ----- 2 files changed, 273 deletions(-) delete mode 100644 src-c/core/Encoding.cc delete mode 100644 src-c/core/Encoding.h diff --git a/src-c/core/Encoding.cc b/src-c/core/Encoding.cc deleted file mode 100644 index 10bdd9f..0000000 --- a/src-c/core/Encoding.cc +++ /dev/null @@ -1,244 +0,0 @@ -#include "Encoding.h" - -static inline bool -check_continuation_bytes(const uint8_t ** buffer, - uint8_t continuation_bytes_to_check) -{ - for (uint8_t i = 0; i < continuation_bytes_to_check; i++) - { - if ((**buffer & 0xC0u) != 0x80u) - return false; - (*buffer)++; - } - return true; -} - -Encoding::Type Encoding::detect_encoding(const uint8_t * buffer, size_t length) -{ - for (size_t i = 0u; i < length; i++) - { - const uint8_t c = *buffer; - uint8_t continuation_bytes_to_check; - if ((c & 0x80u) == 0u) - { - buffer++; - continue; - } - else if ((c & 0xE0u) == 0xC0u) - { - continuation_bytes_to_check = 1u; - } - else if ((c & 0xF0u) == 0xE0u) - { - continuation_bytes_to_check = 2u; - } - else if ((c & 0xF8u) == 0xF0u) - { - continuation_bytes_to_check = 3u; - } - else if ((c & 0xFCu) == 0xF8u) - { - continuation_bytes_to_check = 4u; - } - else if ((c & 0xFEu) == 0xFCu) - { - continuation_bytes_to_check = 5u; - } - else - { - return CP_1252; - } - buffer++; - if (((i + continuation_bytes_to_check) < length) && - check_continuation_bytes(&buffer, continuation_bytes_to_check)) - { - i += continuation_bytes_to_check; - continue; - } - return CP_1252; - } - - return UTF_8; -} - -uint8_t Encoding::num_bytes_in_code_point(Type type, const uint8_t * encoded) -{ - switch (type) - { - case UTF_8: - { - if ((*encoded & 0x80u) == 0u) - return 1u; - encoded++; - uint8_t n = 1u; - while ((*encoded++ & 0xC0u) == 0x80u) - n++; - return n; - } - break; - case CP_1252: - return 1u; - } - return 0u; -} - -const uint8_t * Encoding::beginning_of_code_point(Type type, const uint8_t * encoded) -{ - switch (type) - { - case UTF_8: - while ((*encoded & 0xC0u) == 0x80u) - encoded--; - return encoded; - case CP_1252: - return encoded; - } - - return nullptr; -} - -uint32_t Encoding::decode(Type type, const uint8_t * encoded, uint8_t * encoded_size) -{ - uint32_t result = 0u; - uint8_t size = 0u; - switch (type) - { - case UTF_8: - { - const uint_fast8_t c = *encoded; - if ((c & 0x80u) == 0u) - { - result = c; - size = 1u; - } - else - { - uint_fast8_t following_bytes = 0u; - if ((c & 0xE0u) == 0xC0u) - { - result = c & 0x1Fu; - following_bytes = 1u; - } - else if ((c & 0xF0u) == 0xE0u) - { - result = c & 0x0Fu; - following_bytes = 2u; - } - else if ((c & 0xF8u) == 0xF0u) - { - result = c & 0x07u; - following_bytes = 3u; - } - else if ((c & 0xFCu) == 0xF8u) - { - result = c & 0x03u; - following_bytes = 4u; - } - else if ((c & 0xFEu) == 0xFCu) - { - result = c & 0x01u; - following_bytes = 5u; - } - size = following_bytes + 1u; - while (following_bytes-- > 0u) - { - encoded++; - result <<= 6u; - result |= *encoded & 0x3Fu; - } - } - } - break; - case CP_1252: - /* TODO: map byte to code point */ - result = *encoded; - size = 1u; - break; - } - - if (encoded_size != nullptr) - { - *encoded_size = size; - } - - return result; -} - -uint8_t Encoding::num_bytes_to_encode_code_point(uint32_t code_point, Type type) -{ - switch (type) - { - case UTF_8: - if (code_point <= 0x7Fu) - { - return 1u; - } - else if (code_point <= 0x7FFu) - { - return 2u; - } - else if (code_point <= 0xFFFFu) - { - return 3u; - } - else if (code_point <= 0x1FFFFFu) - { - return 4u; - } - else if (code_point <= 0x3FFFFFFu) - { - return 5u; - } - else - { - return 6u; - } - break; - case CP_1252: - if (code_point <= 0xFFu) - { - return 1u; - } - else - { - return 0u; - } - break; - } - - return 0u; -} - -uint8_t Encoding::encode(uint32_t code_point, Type type, uint8_t * buffer) -{ - switch (type) - { - case UTF_8: - { - uint8_t size = num_bytes_to_encode_code_point(code_point, type); - if (size == 1u) - { - *buffer = code_point; - } - else if (size > 1u) - { - *buffer = (0xFFu << (8u - size)) | (code_point >> (6u * (size - 1u))); - for (uint8_t i = 0u, end = size - 1u; i < end; i++) - { - buffer[end - i] = 0x80u | (code_point >> (6u * i) & 0x3Fu); - } - } - return size; - } - break; - case CP_1252: - if (code_point <= 0xFFu) - { - *buffer = code_point; - return 1u; - } - break; - } - - return 0u; -} diff --git a/src-c/core/Encoding.h b/src-c/core/Encoding.h deleted file mode 100644 index 97bf935..0000000 --- a/src-c/core/Encoding.h +++ /dev/null @@ -1,29 +0,0 @@ -#ifndef ENCODING_H -#define ENCODING_H - -#include -#include - -class Encoding -{ -public: - enum Type - { - UTF_8, - CP_1252, - }; - - enum : uint32_t - { - MAX_CODE_POINT_SIZE = 8u, - }; - - static Type detect_encoding(const uint8_t * buffer, size_t length); - static uint8_t num_bytes_in_code_point(Type type, const uint8_t * encoded); - static const uint8_t * beginning_of_code_point(Type type, const uint8_t * encoded); - static uint32_t decode(Type type, const uint8_t * encoded, uint8_t * encoded_size = nullptr); - static uint8_t num_bytes_to_encode_code_point(uint32_t code_point, Type type); - static uint8_t encode(uint32_t code_point, Type type, uint8_t * buffer); -}; - -#endif