diff --git a/clickhouse/base/wire_format.h b/clickhouse/base/wire_format.h index f383fcbc..b6c62f1f 100644 --- a/clickhouse/base/wire_format.h +++ b/clickhouse/base/wire_format.h @@ -23,7 +23,7 @@ class WireFormat { static void WriteBytes(CodedOutputStream* output, const void* buf, size_t len); - static void WriteString(CodedOutputStream* output, const std::string& value); + static void WriteString(CodedOutputStream* output, std::string_view value); static void WriteUInt64(CodedOutputStream* output, const uint64_t value); }; @@ -85,7 +85,7 @@ inline void WireFormat::WriteBytes( inline void WireFormat::WriteString( CodedOutputStream* output, - const std::string& value) + std::string_view value) { output->WriteVarint64(value.size()); output->WriteRaw(value.data(), value.size()); diff --git a/clickhouse/columns/ip6.cpp b/clickhouse/columns/ip6.cpp index 47e07445..7c814a78 100644 --- a/clickhouse/columns/ip6.cpp +++ b/clickhouse/columns/ip6.cpp @@ -43,7 +43,7 @@ std::string ColumnIPv6::AsString (size_t n) const{ char buf[INET6_ADDRSTRLEN]; const char* ip_str = inet_ntop(AF_INET6, addr.data(), buf, INET6_ADDRSTRLEN); if (ip_str == nullptr) { - throw std::runtime_error("invalid IPv6 format: " + addr); + throw std::runtime_error("invalid IPv6 format: " + std::string(addr)); } return ip_str; } diff --git a/clickhouse/columns/string.cpp b/clickhouse/columns/string.cpp index 2c22c6fb..eb3acf88 100644 --- a/clickhouse/columns/string.cpp +++ b/clickhouse/columns/string.cpp @@ -3,6 +3,26 @@ #include "../base/wire_format.h" +namespace +{ +const size_t DEFAULT_BLOCK_SIZE = 4096; + +template +size_t ComputeTotalSize(const Container & strings, size_t begin = 0, size_t len = -1) +{ + size_t result = 0; + if (begin < strings.size()) { + len = std::min(len, strings.size() - begin); + + for (size_t i = begin; i < begin + len; ++i) + result += strings[i].size(); + } + + return result; +} + +} + namespace clickhouse { ColumnFixedString::ColumnFixedString(size_t n) @@ -11,21 +31,29 @@ ColumnFixedString::ColumnFixedString(size_t n) { } -void ColumnFixedString::Append(const std::string& str) { - data_.push_back(str); - data_.back().resize(string_size_); +void ColumnFixedString::Append(std::string_view str) { + if (data_.capacity() < str.size()) + { + // round up to the next block size + const auto new_size = (((data_.size() + string_size_) / DEFAULT_BLOCK_SIZE) + 1) * DEFAULT_BLOCK_SIZE; + data_.reserve(new_size); + } + + data_.insert(data_.size(), str); } void ColumnFixedString::Clear() { data_.clear(); } -const std::string& ColumnFixedString::At(size_t n) const { - return data_.at(n); +std::string_view ColumnFixedString::At(size_t n) const { + const auto pos = n * string_size_; + return std::string_view(&data_.at(pos), string_size_); } -const std::string& ColumnFixedString::operator [] (size_t n) const { - return data_[n]; +std::string_view ColumnFixedString::operator [](size_t n) const { + const auto pos = n * string_size_; + return std::string_view(&data_[pos], string_size_); } size_t ColumnFixedString::FixedSize() const @@ -42,104 +70,187 @@ void ColumnFixedString::Append(ColumnRef column) { } bool ColumnFixedString::Load(CodedInputStream* input, size_t rows) { - data_.reserve(data_.size() + rows); - - for (size_t i = 0; i < rows; ++i) { - std::string s; - s.resize(string_size_); - - if (!WireFormat::ReadBytes(input, &s[0], s.size())) { - return false; - } - - data_.push_back(std::move(s)); + data_.resize(string_size_ * rows); + if (!WireFormat::ReadBytes(input, &data_[0], data_.size())) { + return false; } return true; } void ColumnFixedString::Save(CodedOutputStream* output) { - for (size_t i = 0; i < data_.size(); ++i) { - WireFormat::WriteBytes(output, data_[i].data(), string_size_); - } + WireFormat::WriteBytes(output, data_.data(), data_.size()); } size_t ColumnFixedString::Size() const { - return data_.size(); + return data_.size() / string_size_; } ColumnRef ColumnFixedString::Slice(size_t begin, size_t len) { auto result = std::make_shared(string_size_); - if (begin < data_.size()) { - result->data_ = SliceVector(data_, begin, len); + if (begin < Size()) { + const auto b = begin * string_size_; + const auto l = len * string_size_; + result->data_ = data_.substr(b, std::min(data_.size() - b, l)); } return result; } +struct ColumnString::Block +{ + using CharT = typename std::string::value_type; + + explicit Block(size_t starting_capacity) + : size(0), + capacity(starting_capacity), + data_(new CharT[capacity]) + {} + + inline auto GetAvailble() const + { + return capacity - size; + } + + std::string_view AppendUnsafe(std::string_view str) + { + const auto pos = &data_[size]; + + memcpy(pos, str.data(), str.size()); + size += str.size(); + + return std::string_view(pos, str.size()); + } + + auto GetCurrentWritePos() + { + return &data_[size]; + } + + std::string_view ConsumeTailAsStringViewUnsafe(size_t len) + { + const auto start = &data_[size]; + size += len; + return std::string_view(start, len); + } + + size_t size; + const size_t capacity; + std::unique_ptr data_; +}; ColumnString::ColumnString() : Column(Type::CreateString()) { } -ColumnString::ColumnString(const std::vector& data) +ColumnString::ColumnString(const std::vector & data) : Column(Type::CreateString()) - , data_(data) { + items_.reserve(data.size()); + blocks_.emplace_back(ComputeTotalSize(data)); + + for (const auto & s : data) + { + AppendUnsafe(s); + } } -void ColumnString::Append(const std::string& str) { - data_.push_back(str); +ColumnString::~ColumnString() +{} + +void ColumnString::Append(std::string_view str) { + if (blocks_.size() == 0 || blocks_.back().GetAvailble() < str.length()) + { + blocks_.emplace_back(std::max(DEFAULT_BLOCK_SIZE, str.size())); + } + + items_.emplace_back(blocks_.back().AppendUnsafe(str)); +} + +void ColumnString::AppendUnsafe(std::string_view str) +{ + items_.emplace_back(blocks_.back().AppendUnsafe(str)); } void ColumnString::Clear() { - data_.clear(); + items_.clear(); + blocks_.clear(); } -const std::string& ColumnString::At(size_t n) const { - return data_.at(n); +std::string_view ColumnString::At(size_t n) const { + return items_.at(n); } -const std::string& ColumnString::operator [] (size_t n) const { - return data_[n]; +std::string_view ColumnString::operator [] (size_t n) const { + return items_[n]; } void ColumnString::Append(ColumnRef column) { if (auto col = column->As()) { - data_.insert(data_.end(), col->data_.begin(), col->data_.end()); + const auto total_size = ComputeTotalSize(col->items_); + + // TODO: fill up existing block with some items and then add a new one for the rest of items + if (blocks_.size() == 0 || blocks_.back().GetAvailble() < total_size) + blocks_.emplace_back(std::max(DEFAULT_BLOCK_SIZE, total_size)); + items_.reserve(items_.size() + col->Size()); + + for (size_t i = 0; i < column->Size(); ++i) { + this->AppendUnsafe((*col)[i]); + } } } bool ColumnString::Load(CodedInputStream* input, size_t rows) { - data_.reserve(data_.size() + rows); + items_.clear(); + blocks_.clear(); + + items_.reserve(rows); + Block * block = nullptr; + // TODO(performance): unroll a loop to a first row (to get rid of `blocks_.size() == 0` check) and the rest. for (size_t i = 0; i < rows; ++i) { - std::string s; + uint64_t len; + if (!WireFormat::ReadUInt64(input, &len)) + return false; + + if (blocks_.size() == 0 || len > block->GetAvailble()) + block = &blocks_.emplace_back(std::max(DEFAULT_BLOCK_SIZE, len)); - if (!WireFormat::ReadString(input, &s)) { + if (!WireFormat::ReadBytes(input, block->GetCurrentWritePos(), len)) return false; - } - data_.push_back(std::move(s)); + items_.emplace_back(block->ConsumeTailAsStringViewUnsafe(len)); } return true; } void ColumnString::Save(CodedOutputStream* output) { - for (auto si = data_.begin(); si != data_.end(); ++si) { - WireFormat::WriteString(output, *si); + for (const auto & item : items_) { + WireFormat::WriteString(output, item); } } size_t ColumnString::Size() const { - return data_.size(); + return items_.size(); } ColumnRef ColumnString::Slice(size_t begin, size_t len) { - return std::make_shared(SliceVector(data_, begin, len)); + auto result = std::make_shared(); + + if (begin < items_.size()) { + len = std::min(len, items_.size() - begin); + + result->blocks_.emplace_back(ComputeTotalSize(items_, begin, len)); + for (size_t i = begin; i < begin + len; ++i) + { + result->Append(items_[i]); + } + } + + return result; } } diff --git a/clickhouse/columns/string.h b/clickhouse/columns/string.h index 981e3426..ae481e58 100644 --- a/clickhouse/columns/string.h +++ b/clickhouse/columns/string.h @@ -2,6 +2,11 @@ #include "column.h" +#include +#include +#include +#include + namespace clickhouse { /** @@ -12,13 +17,13 @@ class ColumnFixedString : public Column { explicit ColumnFixedString(size_t n); /// Appends one element to the column. - void Append(const std::string& str); + void Append(std::string_view str); /// Returns element at given row number. - const std::string& At(size_t n) const; + std::string_view At(size_t n) const; /// Returns element at given row number. - const std::string& operator [] (size_t n) const; + std::string_view operator [] (size_t n) const; /// Returns the max size of the fixed string size_t FixedSize() const; @@ -44,7 +49,7 @@ class ColumnFixedString : public Column { private: const size_t string_size_; - std::vector data_; + std::string data_; }; /** @@ -53,16 +58,18 @@ class ColumnFixedString : public Column { class ColumnString : public Column { public: ColumnString(); - explicit ColumnString(const std::vector& data); + ~ColumnString(); + + explicit ColumnString(const std::vector & data); /// Appends one element to the column. - void Append(const std::string& str); + void Append(std::string_view str); /// Returns element at given row number. - const std::string& At(size_t n) const; + std::string_view At(size_t n) const; /// Returns element at given row number. - const std::string& operator [] (size_t n) const; + std::string_view operator [] (size_t n) const; public: /// Appends content of given column to the end of current one. @@ -84,7 +91,13 @@ class ColumnString : public Column { ColumnRef Slice(size_t begin, size_t len) override; private: - std::vector data_; + void AppendUnsafe(std::string_view); + +private: + struct Block; + + std::vector items_; + std::vector blocks_; }; } diff --git a/tests/simple/main.cpp b/tests/simple/main.cpp index fe09068a..8b2d1089 100644 --- a/tests/simple/main.cpp +++ b/tests/simple/main.cpp @@ -369,9 +369,10 @@ inline void EnumExample(Client& client) { } inline void SelectNull(Client& client) { - client.Select("SELECT NULL", [](const Block& block) + client.Select("SELECT NULL", []([[maybe_unused]] const Block& block) { assert(block.GetRowCount() < 2); + (void)(block); } ); } diff --git a/ut/client_ut.cpp b/ut/client_ut.cpp index c07e9606..c4e984b6 100644 --- a/ut/client_ut.cpp +++ b/ut/client_ut.cpp @@ -13,7 +13,8 @@ class ClientCase : public testing::TestWithParam { } void TearDown() override { - client_->Execute("DROP DATABASE test_clickhouse_cpp"); + if (client_) + client_->Execute("DROP DATABASE test_clickhouse_cpp"); delete client_; } diff --git a/ut/columns_ut.cpp b/ut/columns_ut.cpp index a1bf8c42..bfaf7d2d 100644 --- a/ut/columns_ut.cpp +++ b/ut/columns_ut.cpp @@ -7,6 +7,7 @@ #include #include +#include using namespace clickhouse; @@ -171,3 +172,214 @@ TEST(ColumnsCase, UUIDSlice) { ASSERT_EQ(sub->At(0), UInt128(0x84b9f24bc26b49c6llu, 0xa03b4ab723341951llu)); ASSERT_EQ(sub->At(1), UInt128(0x3507213c178649f9llu, 0x9faf035d662f60aellu)); } + + + +std::uint64_t generate(const ColumnUInt64&, size_t index) +{ + const auto base = static_cast(index) % 255; + return base << 7*8 | base << 6*8 | base << 5*8 | base << 4*8 | base << 3*8 | base << 2*8 | base << 1*8 | base; +} + +template +std::string_view generate_string_view(size_t index) +{ + static const char result_template[] = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789"; + const auto template_size = sizeof(result_template) - 1; + + const auto start_pos = index % (template_size - RESULT_SIZE); + return std::string_view(&result_template[start_pos], RESULT_SIZE); +} + +std::string_view generate(const ColumnString&, size_t index) +{ + // ColumString stores item lengts,and on 1M etnries that builds up to extra 1M bytes, + // comparing to 8M bytes of serialized data for ColumnFixedString and ColumUInt64. + // So in order to make comparison mode fair, reducing size of data item. + return generate_string_view<7>(index); +} + +std::string_view generate(const ColumnFixedString&, size_t index) +{ + return generate_string_view<8>(index); +} + +template +struct Timer +{ + using DurationType = ChronoDurationType; + + Timer() {} + + void restart() + { + started_at = current(); + } + + void start() + { + restart(); + } + + auto elapsed() const + { + return std::chrono::duration_cast(current() - started_at); + } + + auto current() const + { + struct timespec ts; + clock_gettime(CLOCK_THREAD_CPUTIME_ID, &ts); + return std::chrono::nanoseconds(ts.tv_sec * 1000000000LL + ts.tv_nsec); + } + +private: + std::chrono::nanoseconds started_at; +}; + +template +class PausableTimer +{ +public: + PausableTimer() + {} + + void Start() + { + timer.restart(); + paused = false; + } + + void Pause() + { + total += timer.elapsed(); + paused = true; + } + + auto GetTotalElapsed() const + { + if (paused) + { + return total; + } + else + { + return total + timer.elapsed(); + } + } + + void Reset() + { + Pause(); + total = ChronoDurationType{0}; + } + + Timer timer; + ChronoDurationType total = ChronoDurationType{0}; + bool paused = false; +}; + +template +void TestItemLoadingAndSaving(ColumnType && col) +{ + const size_t ITEMS_COUNT = 1'000'000; + const int LOAD_AND_SAVE_REPEAT_TIMES = 10; // run Load() and Save() multiple times to cancel out measurement errors. + + std::cerr << "\n===========================================================" << std::endl; + std::cerr << "\t" << ITEMS_COUNT << " items of " << col.Type()->GetName() << std::endl; + + PausableTimer timer; + + timer.Start(); + for (size_t i = 0; i < ITEMS_COUNT; ++i) + { + const auto value = generate(col, i); + col.Append(value); + } + + EXPECT_EQ(ITEMS_COUNT, col.Size()); + std::cerr << "Appending:\t" << timer.GetTotalElapsed().count() << " us" + << std::endl; + + auto validate = [=, &col = std::as_const(col), &timer]() + { + timer.Reset(); + timer.Start(); + + // validate that appended items match expected + for (size_t i = 0; i < ITEMS_COUNT; ++i) + { + SCOPED_TRACE(i); + + ASSERT_EQ(col.At(i), generate(col, i)); + ASSERT_EQ(col[i], generate(col, i)); + } + + std::cerr << "Accessing (twice):\t" << timer.GetTotalElapsed().count() << " us" + << std::endl; + }; + + validate(); + EXPECT_NO_FATAL_FAILURE(); + + Buffer buffer; + + // Save + { + timer.Reset(); + + for (int i = 0; i < LOAD_AND_SAVE_REPEAT_TIMES; ++i) + { + buffer.clear(); + BufferOutput bufferOutput(&buffer); + CodedOutputStream ostr(&bufferOutput); + + timer.Start(); + col.Save(&ostr); + ostr.Flush(); + timer.Pause(); + } + const auto elapsed = timer.GetTotalElapsed() / (LOAD_AND_SAVE_REPEAT_TIMES * 1.0); + + std::cerr << "Saving:\t" << elapsed.count() << " us" + << std::endl; + } + + // Load + { + timer.Reset(); + + for (int i = 0; i < LOAD_AND_SAVE_REPEAT_TIMES; ++i) + { + ArrayInput arrayInput(buffer.data(), buffer.size()); + CodedInputStream istr(&arrayInput); + col.Clear(); + + timer.Start(); + col.Load(&istr, ITEMS_COUNT); + timer.Pause(); + } + const auto elapsed = timer.GetTotalElapsed() / (LOAD_AND_SAVE_REPEAT_TIMES * 1.0); + + std::cerr << "Loading:\t" << elapsed.count() << " us" + << std::endl; + } + + validate(); + EXPECT_NO_FATAL_FAILURE(); +} + +//// test deserialization of the FixedString column +TEST(ColumnsCase, PERFORMANCE_FixedString) { + TestItemLoadingAndSaving(ColumnFixedString(8)); +} + +TEST(ColumnsCase, PERFORMANCE_String) { + + TestItemLoadingAndSaving(ColumnString()); +} + +TEST(ColumnsCase, PERFORMANCE_Int) { + + TestItemLoadingAndSaving(ColumnUInt64()); +}