Skip to content
Merged
Show file tree
Hide file tree
Changes from 6 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions clickhouse/base/wire_format.h
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ class WireFormat {

static void WriteBytes(CodedOutputStream* output, const void* buf, size_t len);

static void WriteString(CodedOutputStream* output, const std::string& value);
static void WriteString(CodedOutputStream* output, std::string_view value);

static void WriteUInt64(CodedOutputStream* output, const uint64_t value);
};
Expand Down Expand Up @@ -85,7 +85,7 @@ inline void WireFormat::WriteBytes(

inline void WireFormat::WriteString(
CodedOutputStream* output,
const std::string& value)
std::string_view value)
{
output->WriteVarint64(value.size());
output->WriteRaw(value.data(), value.size());
Expand Down
2 changes: 1 addition & 1 deletion clickhouse/columns/ip6.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@ std::string ColumnIPv6::AsString (size_t n) const{
char buf[INET6_ADDRSTRLEN];
const char* ip_str = inet_ntop(AF_INET6, addr.data(), buf, INET6_ADDRSTRLEN);
if (ip_str == nullptr) {
throw std::runtime_error("invalid IPv6 format: " + addr);
throw std::runtime_error("invalid IPv6 format: " + std::string(addr));
}
return ip_str;
}
Expand Down
199 changes: 155 additions & 44 deletions clickhouse/columns/string.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,26 @@

#include "../base/wire_format.h"

namespace
{
const size_t DEFAULT_BLOCK_SIZE = 4096;

template <typename Container>
size_t ComputeTotalSize(const Container & strings, size_t begin = 0, size_t len = -1)
{
size_t result = 0;
if (begin < strings.size()) {
len = std::min(len, strings.size() - begin);

for (size_t i = begin; i < begin + len; ++i)
result += strings[i].size();
}

return result;
}

}

namespace clickhouse {

ColumnFixedString::ColumnFixedString(size_t n)
Expand All @@ -11,21 +31,29 @@ ColumnFixedString::ColumnFixedString(size_t n)
{
}

void ColumnFixedString::Append(const std::string& str) {
data_.push_back(str);
data_.back().resize(string_size_);
void ColumnFixedString::Append(std::string_view str) {
if (data_.capacity() < str.size())
{
// round up to the next block size
const auto new_size = (((data_.size() + string_size_) / DEFAULT_BLOCK_SIZE) + 1) * DEFAULT_BLOCK_SIZE;
data_.reserve(new_size);
}

data_.insert(data_.size(), str);
}

void ColumnFixedString::Clear() {
data_.clear();
}

const std::string& ColumnFixedString::At(size_t n) const {
return data_.at(n);
std::string_view ColumnFixedString::At(size_t n) const {
const auto pos = n * string_size_;
return std::string_view(&data_.at(pos), string_size_);
}

const std::string& ColumnFixedString::operator [] (size_t n) const {
return data_[n];
std::string_view ColumnFixedString::operator [](size_t n) const {
const auto pos = n * string_size_;
return std::string_view(&data_[pos], string_size_);
}

size_t ColumnFixedString::FixedSize() const
Expand All @@ -42,104 +70,187 @@ void ColumnFixedString::Append(ColumnRef column) {
}

bool ColumnFixedString::Load(CodedInputStream* input, size_t rows) {
data_.reserve(data_.size() + rows);

for (size_t i = 0; i < rows; ++i) {
std::string s;
s.resize(string_size_);

if (!WireFormat::ReadBytes(input, &s[0], s.size())) {
return false;
}

data_.push_back(std::move(s));
data_.resize(string_size_ * rows);
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The original code suggests appending to data_ (i.e., as a reader, I'll assume that there already could be some data in data_), the new code writes from the beginning as if there is no data in data_ ever, at this point. Pointing this out to just get a confirmation, that this new behavior is correct conceptually.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Also, resizing a string assumes implicit zero-filling, which could become significant, but I am not sure there is a quick and easy way to avoid it.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

unfortunately there is no easy way (not even hard way) to avoid pre-filling std::string, and that is why I use Block for ColumnString below.

As for assumptions: somehow those assumptions are opposite for rest of column types, which erase previous data. So I had to unify behaviors.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Right, but here, that zero-filling still takes place.

If not now, maybe in future, this could be used. Looks like the header itself is standalone:
https://github.com/facebook/folly/blob/master/folly/memory/UninitializedMemoryHacks.h

Copy link
Contributor Author

@Enmk Enmk Feb 4, 2020

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I guess this is not a big problem for FixedString, but yes, we might want to use Block here too/ That facebook hack...

if (!WireFormat::ReadBytes(input, &data_[0], data_.size())) {
return false;
}

return true;
}

void ColumnFixedString::Save(CodedOutputStream* output) {
for (size_t i = 0; i < data_.size(); ++i) {
WireFormat::WriteBytes(output, data_[i].data(), string_size_);
}
WireFormat::WriteBytes(output, data_.data(), data_.size());
}

size_t ColumnFixedString::Size() const {
return data_.size();
return data_.size() / string_size_;
}

ColumnRef ColumnFixedString::Slice(size_t begin, size_t len) {
auto result = std::make_shared<ColumnFixedString>(string_size_);

if (begin < data_.size()) {
result->data_ = SliceVector(data_, begin, len);
if (begin < Size()) {
const auto b = begin * string_size_;
const auto l = len * string_size_;
result->data_ = data_.substr(b, std::min(data_.size() - b, l));
}

return result;
return std::move(result);
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Are you sure this std::move is right thing to do here? Wouldn't it prevent the "natural" copy elision? Take a look at this: https://stackoverflow.com/questions/17473753/c11-return-value-optimization-or-move

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hmmm, let me check

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

good find, thank you!

}

struct ColumnString::Block
{
using CharT = typename std::string::value_type;

explicit Block(size_t starting_capacity)
: size(0),
capacity(starting_capacity),
data_(new CharT[capacity])
{}

inline auto GetAvailble() const
{
return capacity - size;
}

std::string_view AppendUnsafe(std::string_view str)
{
const auto pos = &data_[size];

memcpy(pos, str.data(), str.size());
size += str.size();

return std::string_view(pos, str.size());
}

auto GetCurrentWritePos()
{
return &data_[size];
}

std::string_view ConsumeTailAsStringViewUnsafe(size_t len)
{
const auto start = &data_[size];
size += len;
return std::string_view(start, len);
}

size_t size;
const size_t capacity;
std::unique_ptr<CharT[]> data_;
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why not std::string? With its capacity and size management.

Copy link
Contributor Author

@Enmk Enmk Feb 4, 2020

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I wish I could use std::string here, the problem is that there is no way of pre-allocating buffer without pre-filling it with data.

};

ColumnString::ColumnString()
: Column(Type::CreateString())
{
}

ColumnString::ColumnString(const std::vector<std::string>& data)
ColumnString::ColumnString(const std::vector<std::string> & data)
: Column(Type::CreateString())
, data_(data)
{
items_.reserve(data.size());
blocks_.emplace_back(ComputeTotalSize(data));

for (const auto & s : data)
{
AppendUnsafe(s);
}
}

void ColumnString::Append(const std::string& str) {
data_.push_back(str);
ColumnString::~ColumnString()
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This looks like an unused leftover from initial edits?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Nope, just a trick to explicitly instantiate d-tor (and hence d-tor of std::unique_ptr<Block[]>) in the context where Block is defined. Otherwise said unique_ptr fails to compile.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hm, strange. Doesn't ~ColumnString() = default help?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

IIRC that does not prescribe where the d-tor is instantiated, which might be anywhere, like in every other translation unit.

{}

void ColumnString::Append(std::string_view str) {
if (blocks_.size() == 0 || blocks_.back().GetAvailble() < str.length())
{
blocks_.emplace_back(std::max(DEFAULT_BLOCK_SIZE, str.size()));
}

items_.emplace_back(blocks_.back().AppendUnsafe(str));
}

void ColumnString::AppendUnsafe(std::string_view str)
{
items_.emplace_back(blocks_.back().AppendUnsafe(str));
}

void ColumnString::Clear() {
data_.clear();
items_.clear();
blocks_.clear();
}

const std::string& ColumnString::At(size_t n) const {
return data_.at(n);
std::string_view ColumnString::At(size_t n) const {
return items_.at(n);
}

const std::string& ColumnString::operator [] (size_t n) const {
return data_[n];
std::string_view ColumnString::operator [] (size_t n) const {
return items_[n];
}

void ColumnString::Append(ColumnRef column) {
if (auto col = column->As<ColumnString>()) {
data_.insert(data_.end(), col->data_.begin(), col->data_.end());
const auto total_size = ComputeTotalSize(col->items_);

// TODO: fill up existing block with some items and then add a new one for the rest of items
if (blocks_.size() == 0 || blocks_.back().GetAvailble() < total_size)
blocks_.emplace_back(std::max(DEFAULT_BLOCK_SIZE, total_size));
items_.reserve(items_.size() + col->Size());

for (size_t i = 0; i < column->Size(); ++i) {
this->AppendUnsafe((*col)[i]);
}
}
}

bool ColumnString::Load(CodedInputStream* input, size_t rows) {
data_.reserve(data_.size() + rows);
items_.clear();
blocks_.clear();

items_.reserve(rows);
Block * block = nullptr;

// TODO(performance): unroll a loop to a first row (to get rid of `blocks_.size() == 0` check) and the rest.
for (size_t i = 0; i < rows; ++i) {
std::string s;
uint64_t len;
if (!WireFormat::ReadUInt64(input, &len))
return false;

if (blocks_.size() == 0 || len > block->GetAvailble())
block = &blocks_.emplace_back(std::max<size_t>(DEFAULT_BLOCK_SIZE, len));

if (!WireFormat::ReadString(input, &s)) {
if (!WireFormat::ReadBytes(input, block->GetCurrentWritePos(), len))
return false;
}

data_.push_back(std::move(s));
items_.emplace_back(block->ConsumeTailAsStringViewUnsafe(len));
}

return true;
}

void ColumnString::Save(CodedOutputStream* output) {
for (auto si = data_.begin(); si != data_.end(); ++si) {
WireFormat::WriteString(output, *si);
for (const auto & item : items_) {
WireFormat::WriteString(output, item);
}
}

size_t ColumnString::Size() const {
return data_.size();
return items_.size();
}

ColumnRef ColumnString::Slice(size_t begin, size_t len) {
return std::make_shared<ColumnString>(SliceVector(data_, begin, len));
auto result = std::make_shared<ColumnString>();

if (begin < items_.size()) {
len = std::min(len, items_.size() - begin);

result->blocks_.emplace_back(ComputeTotalSize(items_, begin, len));
for (size_t i = begin; i < begin + len; ++i)
{
result->Append(items_[i]);
}
}

return result;
}

}
31 changes: 22 additions & 9 deletions clickhouse/columns/string.h
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,11 @@

#include "column.h"

#include <string>
#include <string_view>
#include <utility>
#include <vector>

namespace clickhouse {

/**
Expand All @@ -12,13 +17,13 @@ class ColumnFixedString : public Column {
explicit ColumnFixedString(size_t n);

/// Appends one element to the column.
void Append(const std::string& str);
void Append(std::string_view str);

/// Returns element at given row number.
const std::string& At(size_t n) const;
std::string_view At(size_t n) const;

/// Returns element at given row number.
const std::string& operator [] (size_t n) const;
std::string_view operator [] (size_t n) const;

/// Returns the max size of the fixed string
size_t FixedSize() const;
Expand All @@ -44,7 +49,7 @@ class ColumnFixedString : public Column {

private:
const size_t string_size_;
std::vector<std::string> data_;
std::string data_;
};

/**
Expand All @@ -53,16 +58,18 @@ class ColumnFixedString : public Column {
class ColumnString : public Column {
public:
ColumnString();
explicit ColumnString(const std::vector<std::string>& data);
~ColumnString();

explicit ColumnString(const std::vector<std::string> & data);

/// Appends one element to the column.
void Append(const std::string& str);
void Append(std::string_view str);

/// Returns element at given row number.
const std::string& At(size_t n) const;
std::string_view At(size_t n) const;

/// Returns element at given row number.
const std::string& operator [] (size_t n) const;
std::string_view operator [] (size_t n) const;

public:
/// Appends content of given column to the end of current one.
Expand All @@ -84,7 +91,13 @@ class ColumnString : public Column {
ColumnRef Slice(size_t begin, size_t len) override;

private:
std::vector<std::string> data_;
void AppendUnsafe(std::string_view);

private:
struct Block;

std::vector<std::string_view> items_;
std::vector<Block> blocks_;
};

}
Loading