Change TextLoader for gap buffer approach
- for CR line endings, convert all CR to LF
- for CRLF line endings, convert all CRLF to LF
This commit is contained in:
parent
1f3ecf950f
commit
65468bd2e5
@ -1,68 +1,65 @@
|
|||||||
#include "TextLoader.h"
|
#include "TextLoader.h"
|
||||||
#include <stdint.h>
|
#include <stdint.h>
|
||||||
|
#include <string.h>
|
||||||
|
|
||||||
/** Create a TextLoader. */
|
/** Create a TextLoader. */
|
||||||
TextLoader::TextLoader()
|
TextLoader::TextLoader()
|
||||||
{
|
{
|
||||||
m_line_endings = LineEndings::LF;
|
m_line_endings = LineEndings::LF;
|
||||||
m_encoding = Encoding::UTF_8;
|
m_encoding = Encoding::UTF_8;
|
||||||
m_lines = NULL;
|
|
||||||
m_eol_at_eof = true;
|
m_eol_at_eof = true;
|
||||||
|
m_num_lines = 0u;
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Scan text to detect line endings and record their positions.
|
* Scan text to detect line endings and record their positions.
|
||||||
*
|
*
|
||||||
* @param buffer Buffer containing the text to import.
|
* @param buffer Buffer containing the text to import.
|
||||||
* @param size Size of the buffer.
|
* @param size Size of the text to load. The buffer must be at least one
|
||||||
|
* byte larger than this.
|
||||||
|
* @param out_size Size of the loaded buffer.
|
||||||
*/
|
*/
|
||||||
void TextLoader::load_buffer(uint8_t * buffer, size_t size)
|
void TextLoader::load_buffer(uint8_t * buffer, size_t size, size_t * out_size)
|
||||||
{
|
{
|
||||||
std::shared_ptr<std::list<Span>> lines[LineEndings::COUNT];
|
std::list<size_t> cr_indexes;
|
||||||
size_t line_start[LineEndings::COUNT] = {0};
|
size_t n_lines[LineEndings::COUNT] = {0};
|
||||||
unsigned int n_cr = 0;
|
|
||||||
unsigned int n_lf = 0;
|
|
||||||
bool crlf = true;
|
bool crlf = true;
|
||||||
for (size_t i = 0; i < LineEndings::COUNT; i++)
|
size_t next_start_of_line_index = 0u;
|
||||||
{
|
size_t out_size_local = size;
|
||||||
lines[i] = std::make_shared<std::list<Span>>();
|
|
||||||
}
|
|
||||||
for (size_t i = 0; i < size; i++)
|
for (size_t i = 0; i < size; i++)
|
||||||
{
|
{
|
||||||
if (buffer[i] == '\r')
|
if (buffer[i] == '\r')
|
||||||
{
|
{
|
||||||
lines[LineEndings::CR]->push_back(Span(&buffer[line_start[LineEndings::CR]], i - line_start[LineEndings::CR]));
|
cr_indexes.push_back(i);
|
||||||
n_cr++;
|
n_lines[LineEndings::CR]++;
|
||||||
line_start[LineEndings::CR] = i + 1;
|
|
||||||
if (crlf)
|
if (crlf)
|
||||||
{
|
{
|
||||||
if ((i < (size - 1)) && (buffer[i + 1] == '\n'))
|
if ((i < (size - 1)) && (buffer[i + 1] == '\n'))
|
||||||
{
|
{
|
||||||
lines[LineEndings::CRLF]->push_back(Span(&buffer[line_start[LineEndings::CRLF]], i - line_start[LineEndings::CRLF]));
|
n_lines[LineEndings::LF]++;
|
||||||
n_lf++;
|
n_lines[LineEndings::CRLF]++;
|
||||||
i++;
|
i++;
|
||||||
line_start[LineEndings::CRLF] = i + 1;
|
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
crlf = false;
|
crlf = false;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
next_start_of_line_index = i + 1;
|
||||||
}
|
}
|
||||||
else if (buffer[i] == '\n')
|
else if (buffer[i] == '\n')
|
||||||
{
|
{
|
||||||
lines[LineEndings::LF]->push_back(Span(&buffer[line_start[LineEndings::LF]], i - line_start[LineEndings::LF]));
|
|
||||||
crlf = false;
|
crlf = false;
|
||||||
n_lf++;
|
n_lines[LineEndings::LF]++;
|
||||||
line_start[LineEndings::LF] = i + 1;
|
next_start_of_line_index = i + 1;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if (crlf && (n_lf > 0u))
|
if (crlf && (n_lines[LineEndings::LF] > 0u))
|
||||||
{
|
{
|
||||||
m_line_endings = LineEndings::CRLF;
|
m_line_endings = LineEndings::CRLF;
|
||||||
}
|
}
|
||||||
else if ((n_cr > 0u) && (n_lf == 0u))
|
else if ((n_lines[LineEndings::CR] > 0u) && (n_lines[LineEndings::LF] == 0u))
|
||||||
{
|
{
|
||||||
m_line_endings = LineEndings::CR;
|
m_line_endings = LineEndings::CR;
|
||||||
}
|
}
|
||||||
@ -71,15 +68,53 @@ void TextLoader::load_buffer(uint8_t * buffer, size_t size)
|
|||||||
m_line_endings = LineEndings::LF;
|
m_line_endings = LineEndings::LF;
|
||||||
}
|
}
|
||||||
|
|
||||||
m_lines = lines[m_line_endings];
|
m_num_lines = n_lines[m_line_endings];
|
||||||
|
|
||||||
/* Check if there is a line that was not terminated by a EOL sequence at
|
/* Check if there is a line that was not terminated by a EOL sequence at
|
||||||
* the end of the file. */
|
* the end of the file. */
|
||||||
if (line_start[m_line_endings] < size)
|
if (next_start_of_line_index < size)
|
||||||
{
|
{
|
||||||
m_lines->push_back(Span(&buffer[line_start[m_line_endings]], size - line_start[m_line_endings]));
|
m_num_lines++;
|
||||||
m_eol_at_eof = false;
|
m_eol_at_eof = false;
|
||||||
|
if (crlf)
|
||||||
|
{
|
||||||
|
cr_indexes.push_back(size);
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
buffer[size] = '\n';
|
||||||
|
out_size_local++;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
m_encoding = Encoding::detect_encoding(buffer, size);
|
m_encoding = Encoding::detect_encoding(buffer, size);
|
||||||
|
|
||||||
|
if (m_line_endings == LineEndings::CRLF)
|
||||||
|
{
|
||||||
|
/* Compress all CRLF sequences to LF in memory. */
|
||||||
|
size_t dest = 0u;
|
||||||
|
size_t src = 0u;
|
||||||
|
for (auto cr_index : cr_indexes)
|
||||||
|
{
|
||||||
|
size_t size = cr_index - src;
|
||||||
|
if ((src != dest) && (size > 0u))
|
||||||
|
{
|
||||||
|
memmove(&buffer[dest], &buffer[src], size);
|
||||||
|
}
|
||||||
|
dest += size;
|
||||||
|
src = cr_index + 2u;
|
||||||
|
buffer[dest++] = '\n';
|
||||||
|
}
|
||||||
|
out_size_local = dest;
|
||||||
|
}
|
||||||
|
else if (m_line_endings == LineEndings::CR)
|
||||||
|
{
|
||||||
|
/* Convert all \r to \n */
|
||||||
|
for (auto cr_index : cr_indexes)
|
||||||
|
{
|
||||||
|
buffer[cr_index] = '\n';
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
*out_size = out_size_local;
|
||||||
}
|
}
|
||||||
|
@ -12,29 +12,20 @@ class TextLoader
|
|||||||
{
|
{
|
||||||
public:
|
public:
|
||||||
TextLoader();
|
TextLoader();
|
||||||
void load_buffer(uint8_t * buffer, size_t size);
|
void load_buffer(uint8_t * buffer, size_t size, size_t * out_size);
|
||||||
size_t num_lines()
|
size_t num_lines()
|
||||||
{
|
{
|
||||||
if (m_lines == nullptr)
|
return m_num_lines;
|
||||||
{
|
|
||||||
return 0u;
|
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
return m_lines->size();
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
LineEndings::Type get_line_endings() { return m_line_endings; }
|
LineEndings::Type get_line_endings() { return m_line_endings; }
|
||||||
Encoding::Type get_encoding() { return m_encoding; }
|
Encoding::Type get_encoding() { return m_encoding; }
|
||||||
auto begin() { return m_lines->begin(); }
|
|
||||||
auto end() { return m_lines->end(); }
|
|
||||||
bool get_eol_at_eof() { return m_eol_at_eof; }
|
bool get_eol_at_eof() { return m_eol_at_eof; }
|
||||||
|
|
||||||
protected:
|
protected:
|
||||||
LineEndings::Type m_line_endings;
|
LineEndings::Type m_line_endings;
|
||||||
Encoding::Type m_encoding;
|
Encoding::Type m_encoding;
|
||||||
bool m_eol_at_eof;
|
bool m_eol_at_eof;
|
||||||
std::shared_ptr<std::list<Span>> m_lines;
|
size_t m_num_lines;
|
||||||
};
|
};
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
@ -6,11 +6,6 @@
|
|||||||
|
|
||||||
using namespace std;
|
using namespace std;
|
||||||
|
|
||||||
static string line_to_string(const std::list<Span>::iterator & it)
|
|
||||||
{
|
|
||||||
return string((char *)it->start, it->length);
|
|
||||||
}
|
|
||||||
|
|
||||||
TEST(TextLoaderTest, num_lines_defaults_to_0)
|
TEST(TextLoaderTest, num_lines_defaults_to_0)
|
||||||
{
|
{
|
||||||
TextLoader tl;
|
TextLoader tl;
|
||||||
@ -21,7 +16,8 @@ TEST(TextLoaderTest, loading_empty_file)
|
|||||||
{
|
{
|
||||||
TextLoader tl;
|
TextLoader tl;
|
||||||
auto file = TestSupport::read_file("test/files/empty.txt");
|
auto file = TestSupport::read_file("test/files/empty.txt");
|
||||||
tl.load_buffer(&(*file)[0], file->size());
|
size_t loaded_size;
|
||||||
|
tl.load_buffer(&(*file)[0], file->size(), &loaded_size);
|
||||||
EXPECT_EQ(0u, tl.num_lines());
|
EXPECT_EQ(0u, tl.num_lines());
|
||||||
EXPECT_TRUE(tl.get_eol_at_eof());
|
EXPECT_TRUE(tl.get_eol_at_eof());
|
||||||
}
|
}
|
||||||
@ -30,53 +26,53 @@ TEST(TextLoaderTest, detects_lf_line_endings)
|
|||||||
{
|
{
|
||||||
TextLoader tl;
|
TextLoader tl;
|
||||||
auto file = TestSupport::read_file("test/files/line_endings/lf_format.txt");
|
auto file = TestSupport::read_file("test/files/line_endings/lf_format.txt");
|
||||||
tl.load_buffer(&(*file)[0], file->size());
|
size_t loaded_size;
|
||||||
|
tl.load_buffer(&(*file)[0], file->size(), &loaded_size);
|
||||||
EXPECT_EQ(LineEndings::LF, tl.get_line_endings());
|
EXPECT_EQ(LineEndings::LF, tl.get_line_endings());
|
||||||
ASSERT_EQ(2u, tl.num_lines());
|
ASSERT_EQ(2u, tl.num_lines());
|
||||||
auto it = tl.begin();
|
string expected = "Hello.\nThis file is in LF line ending format.\n";
|
||||||
EXPECT_EQ("Hello.", line_to_string(it));
|
EXPECT_EQ(expected, string((char *)&(*file)[0], expected.size()));
|
||||||
it++;
|
|
||||||
EXPECT_EQ("This file is in LF line ending format.", line_to_string(it));
|
|
||||||
EXPECT_TRUE(tl.get_eol_at_eof());
|
EXPECT_TRUE(tl.get_eol_at_eof());
|
||||||
|
EXPECT_EQ(expected.size(), loaded_size);
|
||||||
}
|
}
|
||||||
|
|
||||||
TEST(TextLoaderTest, detects_cr_line_endings)
|
TEST(TextLoaderTest, detects_cr_line_endings)
|
||||||
{
|
{
|
||||||
TextLoader tl;
|
TextLoader tl;
|
||||||
auto file = TestSupport::read_file("test/files/line_endings/cr_format.txt");
|
auto file = TestSupport::read_file("test/files/line_endings/cr_format.txt");
|
||||||
tl.load_buffer(&(*file)[0], file->size());
|
size_t loaded_size;
|
||||||
|
tl.load_buffer(&(*file)[0], file->size(), &loaded_size);
|
||||||
EXPECT_EQ(LineEndings::CR, tl.get_line_endings());
|
EXPECT_EQ(LineEndings::CR, tl.get_line_endings());
|
||||||
ASSERT_EQ(2u, tl.num_lines());
|
ASSERT_EQ(2u, tl.num_lines());
|
||||||
auto it = tl.begin();
|
string expected = "Hello.\nThis file is in CR line ending format.\n";
|
||||||
EXPECT_EQ("Hello.", line_to_string(it));
|
EXPECT_EQ(expected, string((char *)&(*file)[0], expected.size()));
|
||||||
it++;
|
|
||||||
EXPECT_EQ("This file is in CR line ending format.", line_to_string(it));
|
|
||||||
EXPECT_TRUE(tl.get_eol_at_eof());
|
EXPECT_TRUE(tl.get_eol_at_eof());
|
||||||
|
EXPECT_EQ(expected.size(), loaded_size);
|
||||||
}
|
}
|
||||||
|
|
||||||
TEST(TextLoaderTest, detects_crlf_line_endings)
|
TEST(TextLoaderTest, detects_crlf_line_endings)
|
||||||
{
|
{
|
||||||
TextLoader tl;
|
TextLoader tl;
|
||||||
auto file = TestSupport::read_file("test/files/line_endings/crlf_format.txt");
|
auto file = TestSupport::read_file("test/files/line_endings/crlf_format.txt");
|
||||||
tl.load_buffer(&(*file)[0], file->size());
|
size_t loaded_size;
|
||||||
|
tl.load_buffer(&(*file)[0], file->size(), &loaded_size);
|
||||||
EXPECT_EQ(LineEndings::CRLF, tl.get_line_endings());
|
EXPECT_EQ(LineEndings::CRLF, tl.get_line_endings());
|
||||||
ASSERT_EQ(2u, tl.num_lines());
|
ASSERT_EQ(2u, tl.num_lines());
|
||||||
auto it = tl.begin();
|
string expected = "Hello.\nThis file is in CRLF line ending format.\n";
|
||||||
EXPECT_EQ("Hello.", line_to_string(it));
|
EXPECT_EQ(expected, string((char *)&(*file)[0], expected.size()));
|
||||||
it++;
|
|
||||||
EXPECT_EQ("This file is in CRLF line ending format.", line_to_string(it));
|
|
||||||
EXPECT_TRUE(tl.get_eol_at_eof());
|
EXPECT_TRUE(tl.get_eol_at_eof());
|
||||||
|
EXPECT_EQ(expected.size(), loaded_size);
|
||||||
}
|
}
|
||||||
|
|
||||||
TEST(TextLoaderTest, properly_reads_files_that_do_not_end_in_a_eol_sequence)
|
TEST(TextLoaderTest, properly_reads_files_that_do_not_end_in_a_eol_sequence)
|
||||||
{
|
{
|
||||||
TextLoader tl;
|
TextLoader tl;
|
||||||
auto file = TestSupport::read_file("test/files/no_eol_at_eof.txt");
|
auto file = TestSupport::read_file("test/files/no_eol_at_eof.txt");
|
||||||
tl.load_buffer(&(*file)[0], file->size());
|
size_t loaded_size;
|
||||||
|
tl.load_buffer(&(*file)[0], file->size(), &loaded_size);
|
||||||
ASSERT_EQ(2u, tl.num_lines());
|
ASSERT_EQ(2u, tl.num_lines());
|
||||||
auto it = tl.begin();
|
string expected = "Line 1\nLine 2\n";
|
||||||
EXPECT_EQ("Line 1", line_to_string(it));
|
EXPECT_EQ(expected, string((char *)&(*file)[0], expected.size()));
|
||||||
it++;
|
|
||||||
EXPECT_EQ("Line 2", line_to_string(it));
|
|
||||||
EXPECT_FALSE(tl.get_eol_at_eof());
|
EXPECT_FALSE(tl.get_eol_at_eof());
|
||||||
|
EXPECT_EQ(expected.size(), loaded_size);
|
||||||
}
|
}
|
||||||
|
Loading…
x
Reference in New Issue
Block a user