-
Notifications
You must be signed in to change notification settings - Fork 187
ColumnString and ColumnFixedString performance fix #29
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 6 commits
4f55319
c2dd41b
9280c21
591fe15
0ec5bfd
5eea214
367958f
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -3,6 +3,26 @@ | |
|
||
#include "../base/wire_format.h" | ||
|
||
namespace | ||
{ | ||
const size_t DEFAULT_BLOCK_SIZE = 4096; | ||
|
||
template <typename Container> | ||
size_t ComputeTotalSize(const Container & strings, size_t begin = 0, size_t len = -1) | ||
{ | ||
size_t result = 0; | ||
if (begin < strings.size()) { | ||
len = std::min(len, strings.size() - begin); | ||
|
||
for (size_t i = begin; i < begin + len; ++i) | ||
result += strings[i].size(); | ||
} | ||
|
||
return result; | ||
} | ||
|
||
} | ||
|
||
namespace clickhouse { | ||
|
||
ColumnFixedString::ColumnFixedString(size_t n) | ||
|
@@ -11,21 +31,29 @@ ColumnFixedString::ColumnFixedString(size_t n) | |
{ | ||
} | ||
|
||
void ColumnFixedString::Append(const std::string& str) { | ||
data_.push_back(str); | ||
data_.back().resize(string_size_); | ||
void ColumnFixedString::Append(std::string_view str) { | ||
if (data_.capacity() < str.size()) | ||
{ | ||
// round up to the next block size | ||
const auto new_size = (((data_.size() + string_size_) / DEFAULT_BLOCK_SIZE) + 1) * DEFAULT_BLOCK_SIZE; | ||
data_.reserve(new_size); | ||
} | ||
|
||
data_.insert(data_.size(), str); | ||
} | ||
|
||
void ColumnFixedString::Clear() { | ||
data_.clear(); | ||
} | ||
|
||
const std::string& ColumnFixedString::At(size_t n) const { | ||
return data_.at(n); | ||
std::string_view ColumnFixedString::At(size_t n) const { | ||
const auto pos = n * string_size_; | ||
return std::string_view(&data_.at(pos), string_size_); | ||
} | ||
|
||
const std::string& ColumnFixedString::operator [] (size_t n) const { | ||
return data_[n]; | ||
std::string_view ColumnFixedString::operator [](size_t n) const { | ||
const auto pos = n * string_size_; | ||
return std::string_view(&data_[pos], string_size_); | ||
} | ||
|
||
size_t ColumnFixedString::FixedSize() const | ||
|
@@ -42,104 +70,187 @@ void ColumnFixedString::Append(ColumnRef column) { | |
} | ||
|
||
bool ColumnFixedString::Load(CodedInputStream* input, size_t rows) { | ||
data_.reserve(data_.size() + rows); | ||
|
||
for (size_t i = 0; i < rows; ++i) { | ||
std::string s; | ||
s.resize(string_size_); | ||
|
||
if (!WireFormat::ReadBytes(input, &s[0], s.size())) { | ||
return false; | ||
} | ||
|
||
data_.push_back(std::move(s)); | ||
data_.resize(string_size_ * rows); | ||
if (!WireFormat::ReadBytes(input, &data_[0], data_.size())) { | ||
return false; | ||
} | ||
|
||
return true; | ||
} | ||
|
||
void ColumnFixedString::Save(CodedOutputStream* output) { | ||
for (size_t i = 0; i < data_.size(); ++i) { | ||
WireFormat::WriteBytes(output, data_[i].data(), string_size_); | ||
} | ||
WireFormat::WriteBytes(output, data_.data(), data_.size()); | ||
} | ||
|
||
size_t ColumnFixedString::Size() const { | ||
return data_.size(); | ||
return data_.size() / string_size_; | ||
} | ||
|
||
ColumnRef ColumnFixedString::Slice(size_t begin, size_t len) { | ||
auto result = std::make_shared<ColumnFixedString>(string_size_); | ||
|
||
if (begin < data_.size()) { | ||
result->data_ = SliceVector(data_, begin, len); | ||
if (begin < Size()) { | ||
const auto b = begin * string_size_; | ||
const auto l = len * string_size_; | ||
result->data_ = data_.substr(b, std::min(data_.size() - b, l)); | ||
} | ||
|
||
return result; | ||
return std::move(result); | ||
|
||
} | ||
|
||
struct ColumnString::Block | ||
{ | ||
using CharT = typename std::string::value_type; | ||
|
||
explicit Block(size_t starting_capacity) | ||
: size(0), | ||
capacity(starting_capacity), | ||
data_(new CharT[capacity]) | ||
{} | ||
|
||
inline auto GetAvailble() const | ||
{ | ||
return capacity - size; | ||
} | ||
|
||
std::string_view AppendUnsafe(std::string_view str) | ||
{ | ||
const auto pos = &data_[size]; | ||
|
||
memcpy(pos, str.data(), str.size()); | ||
size += str.size(); | ||
|
||
return std::string_view(pos, str.size()); | ||
} | ||
|
||
auto GetCurrentWritePos() | ||
{ | ||
return &data_[size]; | ||
} | ||
|
||
std::string_view ConsumeTailAsStringViewUnsafe(size_t len) | ||
{ | ||
const auto start = &data_[size]; | ||
size += len; | ||
return std::string_view(start, len); | ||
} | ||
|
||
size_t size; | ||
const size_t capacity; | ||
std::unique_ptr<CharT[]> data_; | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Why not
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I wish I could use std::string here, the problem is that there is no way of pre-allocating buffer without pre-filling it with data. |
||
}; | ||
|
||
ColumnString::ColumnString() | ||
: Column(Type::CreateString()) | ||
{ | ||
} | ||
|
||
ColumnString::ColumnString(const std::vector<std::string>& data) | ||
ColumnString::ColumnString(const std::vector<std::string> & data) | ||
: Column(Type::CreateString()) | ||
, data_(data) | ||
{ | ||
items_.reserve(data.size()); | ||
blocks_.emplace_back(ComputeTotalSize(data)); | ||
|
||
for (const auto & s : data) | ||
{ | ||
AppendUnsafe(s); | ||
} | ||
} | ||
|
||
void ColumnString::Append(const std::string& str) { | ||
data_.push_back(str); | ||
ColumnString::~ColumnString() | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This looks like an unused leftover from initial edits? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Nope, just a trick to explicitly instantiate d-tor (and hence d-tor of There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Hm, strange. Doesn't There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. IIRC that does not prescribe where the d-tor is instantiated, which might be anywhere, like in every other translation unit. |
||
{} | ||
|
||
void ColumnString::Append(std::string_view str) { | ||
if (blocks_.size() == 0 || blocks_.back().GetAvailble() < str.length()) | ||
{ | ||
blocks_.emplace_back(std::max(DEFAULT_BLOCK_SIZE, str.size())); | ||
} | ||
|
||
items_.emplace_back(blocks_.back().AppendUnsafe(str)); | ||
} | ||
|
||
void ColumnString::AppendUnsafe(std::string_view str) | ||
{ | ||
items_.emplace_back(blocks_.back().AppendUnsafe(str)); | ||
} | ||
|
||
void ColumnString::Clear() { | ||
data_.clear(); | ||
items_.clear(); | ||
blocks_.clear(); | ||
} | ||
|
||
const std::string& ColumnString::At(size_t n) const { | ||
return data_.at(n); | ||
std::string_view ColumnString::At(size_t n) const { | ||
return items_.at(n); | ||
} | ||
|
||
const std::string& ColumnString::operator [] (size_t n) const { | ||
return data_[n]; | ||
std::string_view ColumnString::operator [] (size_t n) const { | ||
return items_[n]; | ||
} | ||
|
||
void ColumnString::Append(ColumnRef column) { | ||
if (auto col = column->As<ColumnString>()) { | ||
data_.insert(data_.end(), col->data_.begin(), col->data_.end()); | ||
const auto total_size = ComputeTotalSize(col->items_); | ||
|
||
// TODO: fill up existing block with some items and then add a new one for the rest of items | ||
if (blocks_.size() == 0 || blocks_.back().GetAvailble() < total_size) | ||
blocks_.emplace_back(std::max(DEFAULT_BLOCK_SIZE, total_size)); | ||
items_.reserve(items_.size() + col->Size()); | ||
|
||
for (size_t i = 0; i < column->Size(); ++i) { | ||
this->AppendUnsafe((*col)[i]); | ||
} | ||
} | ||
} | ||
|
||
bool ColumnString::Load(CodedInputStream* input, size_t rows) { | ||
data_.reserve(data_.size() + rows); | ||
items_.clear(); | ||
blocks_.clear(); | ||
|
||
items_.reserve(rows); | ||
Block * block = nullptr; | ||
|
||
// TODO(performance): unroll a loop to a first row (to get rid of `blocks_.size() == 0` check) and the rest. | ||
for (size_t i = 0; i < rows; ++i) { | ||
std::string s; | ||
uint64_t len; | ||
if (!WireFormat::ReadUInt64(input, &len)) | ||
return false; | ||
|
||
if (blocks_.size() == 0 || len > block->GetAvailble()) | ||
block = &blocks_.emplace_back(std::max<size_t>(DEFAULT_BLOCK_SIZE, len)); | ||
|
||
if (!WireFormat::ReadString(input, &s)) { | ||
if (!WireFormat::ReadBytes(input, block->GetCurrentWritePos(), len)) | ||
return false; | ||
} | ||
|
||
data_.push_back(std::move(s)); | ||
items_.emplace_back(block->ConsumeTailAsStringViewUnsafe(len)); | ||
} | ||
|
||
return true; | ||
} | ||
|
||
void ColumnString::Save(CodedOutputStream* output) { | ||
for (auto si = data_.begin(); si != data_.end(); ++si) { | ||
WireFormat::WriteString(output, *si); | ||
for (const auto & item : items_) { | ||
WireFormat::WriteString(output, item); | ||
} | ||
} | ||
|
||
size_t ColumnString::Size() const { | ||
return data_.size(); | ||
return items_.size(); | ||
} | ||
|
||
ColumnRef ColumnString::Slice(size_t begin, size_t len) { | ||
return std::make_shared<ColumnString>(SliceVector(data_, begin, len)); | ||
auto result = std::make_shared<ColumnString>(); | ||
|
||
if (begin < items_.size()) { | ||
len = std::min(len, items_.size() - begin); | ||
|
||
result->blocks_.emplace_back(ComputeTotalSize(items_, begin, len)); | ||
for (size_t i = begin; i < begin + len; ++i) | ||
{ | ||
result->Append(items_[i]); | ||
} | ||
} | ||
|
||
return result; | ||
} | ||
|
||
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The original code suggests appending to
data_
(i.e., as a reader, I'll assume that there already could be some data indata_
), the new code writes from the beginning as if there is no data indata_
ever, at this point. Pointing this out to just get a confirmation, that this new behavior is correct conceptually.There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Also, resizing a string assumes implicit zero-filling, which could become significant, but I am not sure there is a quick and easy way to avoid it.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
unfortunately there is no easy way (not even hard way) to avoid pre-filling std::string, and that is why I use
Block
forColumnString
below.As for assumptions: somehow those assumptions are opposite for rest of column types, which erase previous data. So I had to unify behaviors.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Right, but here, that zero-filling still takes place.
If not now, maybe in future, this could be used. Looks like the header itself is standalone:
https://github.com/facebook/folly/blob/master/folly/memory/UninitializedMemoryHacks.h
Uh oh!
There was an error while loading. Please reload this page.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I guess this is not a big problem for
FixedString
, but yes, we might want to use Block here too/ That facebook hack...