From 65468bd2e57d7549c83789daf18afc15ee89d50d Mon Sep 17 00:00:00 2001 From: Josh Holtrop Date: Tue, 1 Nov 2016 20:53:54 -0400 Subject: [PATCH] Change TextLoader for gap buffer approach - for CR line endings, convert all CR to LF - for CRLF line endings, convert all CRLF to LF --- src/core/TextLoader.cc | 85 ++++++++++++++++++++++++++----------- src/core/TextLoader.h | 15 ++----- test/src/test_TextLoader.cc | 48 ++++++++++----------- 3 files changed, 85 insertions(+), 63 deletions(-) diff --git a/src/core/TextLoader.cc b/src/core/TextLoader.cc index 73a982c..535dc40 100644 --- a/src/core/TextLoader.cc +++ b/src/core/TextLoader.cc @@ -1,68 +1,65 @@ #include "TextLoader.h" #include +#include /** Create a TextLoader. */ TextLoader::TextLoader() { m_line_endings = LineEndings::LF; m_encoding = Encoding::UTF_8; - m_lines = NULL; m_eol_at_eof = true; + m_num_lines = 0u; } /** * Scan text to detect line endings and record their positions. * * @param buffer Buffer containing the text to import. - * @param size Size of the buffer. + * @param size Size of the text to load. The buffer must be at least one + * byte larger than this. + * @param out_size Size of the loaded buffer. */ -void TextLoader::load_buffer(uint8_t * buffer, size_t size) +void TextLoader::load_buffer(uint8_t * buffer, size_t size, size_t * out_size) { - std::shared_ptr> lines[LineEndings::COUNT]; - size_t line_start[LineEndings::COUNT] = {0}; - unsigned int n_cr = 0; - unsigned int n_lf = 0; + std::list cr_indexes; + size_t n_lines[LineEndings::COUNT] = {0}; bool crlf = true; - for (size_t i = 0; i < LineEndings::COUNT; i++) - { - lines[i] = std::make_shared>(); - } + size_t next_start_of_line_index = 0u; + size_t out_size_local = size; for (size_t i = 0; i < size; i++) { if (buffer[i] == '\r') { - lines[LineEndings::CR]->push_back(Span(&buffer[line_start[LineEndings::CR]], i - line_start[LineEndings::CR])); - n_cr++; - line_start[LineEndings::CR] = i + 1; + cr_indexes.push_back(i); + n_lines[LineEndings::CR]++; if (crlf) { if ((i < (size - 1)) && (buffer[i + 1] == '\n')) { - lines[LineEndings::CRLF]->push_back(Span(&buffer[line_start[LineEndings::CRLF]], i - line_start[LineEndings::CRLF])); - n_lf++; + n_lines[LineEndings::LF]++; + n_lines[LineEndings::CRLF]++; i++; - line_start[LineEndings::CRLF] = i + 1; } else { crlf = false; } } + next_start_of_line_index = i + 1; } else if (buffer[i] == '\n') { - lines[LineEndings::LF]->push_back(Span(&buffer[line_start[LineEndings::LF]], i - line_start[LineEndings::LF])); crlf = false; - n_lf++; - line_start[LineEndings::LF] = i + 1; + n_lines[LineEndings::LF]++; + next_start_of_line_index = i + 1; } } - if (crlf && (n_lf > 0u)) + if (crlf && (n_lines[LineEndings::LF] > 0u)) { m_line_endings = LineEndings::CRLF; } - else if ((n_cr > 0u) && (n_lf == 0u)) + else if ((n_lines[LineEndings::CR] > 0u) && (n_lines[LineEndings::LF] == 0u)) { m_line_endings = LineEndings::CR; } @@ -71,15 +68,53 @@ void TextLoader::load_buffer(uint8_t * buffer, size_t size) m_line_endings = LineEndings::LF; } - m_lines = lines[m_line_endings]; + m_num_lines = n_lines[m_line_endings]; /* Check if there is a line that was not terminated by a EOL sequence at * the end of the file. */ - if (line_start[m_line_endings] < size) + if (next_start_of_line_index < size) { - m_lines->push_back(Span(&buffer[line_start[m_line_endings]], size - line_start[m_line_endings])); + m_num_lines++; m_eol_at_eof = false; + if (crlf) + { + cr_indexes.push_back(size); + } + else + { + buffer[size] = '\n'; + out_size_local++; + } } m_encoding = Encoding::detect_encoding(buffer, size); + + if (m_line_endings == LineEndings::CRLF) + { + /* Compress all CRLF sequences to LF in memory. */ + size_t dest = 0u; + size_t src = 0u; + for (auto cr_index : cr_indexes) + { + size_t size = cr_index - src; + if ((src != dest) && (size > 0u)) + { + memmove(&buffer[dest], &buffer[src], size); + } + dest += size; + src = cr_index + 2u; + buffer[dest++] = '\n'; + } + out_size_local = dest; + } + else if (m_line_endings == LineEndings::CR) + { + /* Convert all \r to \n */ + for (auto cr_index : cr_indexes) + { + buffer[cr_index] = '\n'; + } + } + + *out_size = out_size_local; } diff --git a/src/core/TextLoader.h b/src/core/TextLoader.h index 18ad8bd..68ea6d1 100644 --- a/src/core/TextLoader.h +++ b/src/core/TextLoader.h @@ -12,29 +12,20 @@ class TextLoader { public: TextLoader(); - void load_buffer(uint8_t * buffer, size_t size); + void load_buffer(uint8_t * buffer, size_t size, size_t * out_size); size_t num_lines() { - if (m_lines == nullptr) - { - return 0u; - } - else - { - return m_lines->size(); - } + return m_num_lines; } LineEndings::Type get_line_endings() { return m_line_endings; } Encoding::Type get_encoding() { return m_encoding; } - auto begin() { return m_lines->begin(); } - auto end() { return m_lines->end(); } bool get_eol_at_eof() { return m_eol_at_eof; } protected: LineEndings::Type m_line_endings; Encoding::Type m_encoding; bool m_eol_at_eof; - std::shared_ptr> m_lines; + size_t m_num_lines; }; #endif diff --git a/test/src/test_TextLoader.cc b/test/src/test_TextLoader.cc index bd5ade9..bab9618 100644 --- a/test/src/test_TextLoader.cc +++ b/test/src/test_TextLoader.cc @@ -6,11 +6,6 @@ using namespace std; -static string line_to_string(const std::list::iterator & it) -{ - return string((char *)it->start, it->length); -} - TEST(TextLoaderTest, num_lines_defaults_to_0) { TextLoader tl; @@ -21,7 +16,8 @@ TEST(TextLoaderTest, loading_empty_file) { TextLoader tl; auto file = TestSupport::read_file("test/files/empty.txt"); - tl.load_buffer(&(*file)[0], file->size()); + size_t loaded_size; + tl.load_buffer(&(*file)[0], file->size(), &loaded_size); EXPECT_EQ(0u, tl.num_lines()); EXPECT_TRUE(tl.get_eol_at_eof()); } @@ -30,53 +26,53 @@ TEST(TextLoaderTest, detects_lf_line_endings) { TextLoader tl; auto file = TestSupport::read_file("test/files/line_endings/lf_format.txt"); - tl.load_buffer(&(*file)[0], file->size()); + size_t loaded_size; + tl.load_buffer(&(*file)[0], file->size(), &loaded_size); EXPECT_EQ(LineEndings::LF, tl.get_line_endings()); ASSERT_EQ(2u, tl.num_lines()); - auto it = tl.begin(); - EXPECT_EQ("Hello.", line_to_string(it)); - it++; - EXPECT_EQ("This file is in LF line ending format.", line_to_string(it)); + string expected = "Hello.\nThis file is in LF line ending format.\n"; + EXPECT_EQ(expected, string((char *)&(*file)[0], expected.size())); EXPECT_TRUE(tl.get_eol_at_eof()); + EXPECT_EQ(expected.size(), loaded_size); } TEST(TextLoaderTest, detects_cr_line_endings) { TextLoader tl; auto file = TestSupport::read_file("test/files/line_endings/cr_format.txt"); - tl.load_buffer(&(*file)[0], file->size()); + size_t loaded_size; + tl.load_buffer(&(*file)[0], file->size(), &loaded_size); EXPECT_EQ(LineEndings::CR, tl.get_line_endings()); ASSERT_EQ(2u, tl.num_lines()); - auto it = tl.begin(); - EXPECT_EQ("Hello.", line_to_string(it)); - it++; - EXPECT_EQ("This file is in CR line ending format.", line_to_string(it)); + string expected = "Hello.\nThis file is in CR line ending format.\n"; + EXPECT_EQ(expected, string((char *)&(*file)[0], expected.size())); EXPECT_TRUE(tl.get_eol_at_eof()); + EXPECT_EQ(expected.size(), loaded_size); } TEST(TextLoaderTest, detects_crlf_line_endings) { TextLoader tl; auto file = TestSupport::read_file("test/files/line_endings/crlf_format.txt"); - tl.load_buffer(&(*file)[0], file->size()); + size_t loaded_size; + tl.load_buffer(&(*file)[0], file->size(), &loaded_size); EXPECT_EQ(LineEndings::CRLF, tl.get_line_endings()); ASSERT_EQ(2u, tl.num_lines()); - auto it = tl.begin(); - EXPECT_EQ("Hello.", line_to_string(it)); - it++; - EXPECT_EQ("This file is in CRLF line ending format.", line_to_string(it)); + string expected = "Hello.\nThis file is in CRLF line ending format.\n"; + EXPECT_EQ(expected, string((char *)&(*file)[0], expected.size())); EXPECT_TRUE(tl.get_eol_at_eof()); + EXPECT_EQ(expected.size(), loaded_size); } TEST(TextLoaderTest, properly_reads_files_that_do_not_end_in_a_eol_sequence) { TextLoader tl; auto file = TestSupport::read_file("test/files/no_eol_at_eof.txt"); - tl.load_buffer(&(*file)[0], file->size()); + size_t loaded_size; + tl.load_buffer(&(*file)[0], file->size(), &loaded_size); ASSERT_EQ(2u, tl.num_lines()); - auto it = tl.begin(); - EXPECT_EQ("Line 1", line_to_string(it)); - it++; - EXPECT_EQ("Line 2", line_to_string(it)); + string expected = "Line 1\nLine 2\n"; + EXPECT_EQ(expected, string((char *)&(*file)[0], expected.size())); EXPECT_FALSE(tl.get_eol_at_eof()); + EXPECT_EQ(expected.size(), loaded_size); }