Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions src/iceberg/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -84,6 +84,7 @@ set(ICEBERG_SOURCES
util/decimal.cc
util/gzip_internal.cc
util/murmurhash3_internal.cc
util/property_util.cc
util/snapshot_util.cc
util/temporal_util.cc
util/timepoint.cc
Expand Down
71 changes: 60 additions & 11 deletions src/iceberg/catalog/memory/in_memory_catalog.cc
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,9 @@
#include "iceberg/table_identifier.h"
#include "iceberg/table_metadata.h"
#include "iceberg/table_requirement.h"
#include "iceberg/table_requirements.h"
#include "iceberg/table_update.h"
#include "iceberg/transaction.h"
#include "iceberg/util/macros.h"

namespace iceberg {
Expand Down Expand Up @@ -318,7 +320,7 @@ Result<std::string> InMemoryNamespace::GetTableMetadataLocation(
ICEBERG_RETURN_UNEXPECTED(ns);
const auto it = ns.value()->table_metadata_locations_.find(table_ident.name);
if (it == ns.value()->table_metadata_locations_.end()) {
return NotFound("{} does not exist", table_ident.name);
return NotFound("Table does not exist: {}", table_ident);
}
return it->second;
}
Expand Down Expand Up @@ -405,32 +407,66 @@ Result<std::shared_ptr<Table>> InMemoryCatalog::CreateTable(
const std::string& location,
const std::unordered_map<std::string, std::string>& properties) {
std::unique_lock lock(mutex_);
return NotImplemented("create table");
if (root_namespace_->TableExists(identifier).value_or(false)) {
return AlreadyExists("Table already exists: {}", identifier);
}

std::string base_location =
location.empty() ? warehouse_location_ + "/" + identifier.ToString() : location;

ICEBERG_ASSIGN_OR_RAISE(auto table_metadata, TableMetadata::Make(*schema, *spec, *order,
location, properties));

ICEBERG_ASSIGN_OR_RAISE(
auto metadata_file_location,
TableMetadataUtil::Write(*file_io_, nullptr, "", *table_metadata));
ICEBERG_RETURN_UNEXPECTED(
root_namespace_->UpdateTableMetadataLocation(identifier, metadata_file_location));
return Table::Make(identifier, std::move(table_metadata),
std::move(metadata_file_location), file_io_,
std::static_pointer_cast<Catalog>(shared_from_this()));
}

Result<std::shared_ptr<Table>> InMemoryCatalog::UpdateTable(
const TableIdentifier& identifier,
const std::vector<std::unique_ptr<TableRequirement>>& requirements,
const std::vector<std::unique_ptr<TableUpdate>>& updates) {
std::unique_lock lock(mutex_);
ICEBERG_ASSIGN_OR_RAISE(auto base_metadata_location,
root_namespace_->GetTableMetadataLocation(identifier));

ICEBERG_ASSIGN_OR_RAISE(auto base,
TableMetadataUtil::Read(*file_io_, base_metadata_location));
auto base_metadata_location = root_namespace_->GetTableMetadataLocation(identifier);
std::unique_ptr<TableMetadata> base;
std::unique_ptr<TableMetadataBuilder> builder;
ICEBERG_ASSIGN_OR_RAISE(auto is_create, TableRequirements::IsCreate(requirements));
if (is_create) {
if (base_metadata_location.has_value()) {
return AlreadyExists("Table already exists: {}", identifier);
}
int8_t format_version = TableMetadata::kDefaultTableFormatVersion;
for (const auto& update : updates) {
if (update->kind() == TableUpdate::Kind::kUpgradeFormatVersion) {
format_version =
dynamic_cast<const table::UpgradeFormatVersion&>(*update).format_version();
}
}
builder = TableMetadataBuilder::BuildFromEmpty(format_version);
} else {
ICEBERG_RETURN_UNEXPECTED(base_metadata_location);
ICEBERG_ASSIGN_OR_RAISE(
base, TableMetadataUtil::Read(*file_io_, base_metadata_location.value()));
builder = TableMetadataBuilder::BuildFrom(base.get());
}

for (const auto& requirement : requirements) {
ICEBERG_RETURN_UNEXPECTED(requirement->Validate(base.get()));
}

auto builder = TableMetadataBuilder::BuildFrom(base.get());
for (const auto& update : updates) {
update->ApplyTo(*builder);
}
ICEBERG_ASSIGN_OR_RAISE(auto updated, builder->Build());
ICEBERG_ASSIGN_OR_RAISE(
auto new_metadata_location,
TableMetadataUtil::Write(*file_io_, base.get(), base_metadata_location, *updated));
TableMetadataUtil::Write(*file_io_, base.get(), base_metadata_location.value(),
*updated));
ICEBERG_RETURN_UNEXPECTED(
root_namespace_->UpdateTableMetadataLocation(identifier, new_metadata_location));
TableMetadataUtil::DeleteRemovedMetadataFiles(*file_io_, base.get(), *updated);
Expand All @@ -445,7 +481,20 @@ Result<std::shared_ptr<Transaction>> InMemoryCatalog::StageCreateTable(
const std::string& location,
const std::unordered_map<std::string, std::string>& properties) {
std::unique_lock lock(mutex_);
return NotImplemented("stage create table");
if (root_namespace_->TableExists(identifier).value_or(false)) {
return AlreadyExists("Table already exists: {}", identifier);
}

std::string base_location =
location.empty() ? warehouse_location_ + "/" + identifier.ToString() : location;

ICEBERG_ASSIGN_OR_RAISE(
auto table_metadata,
TableMetadata::Make(*schema, *spec, *order, base_location, properties));
ICEBERG_ASSIGN_OR_RAISE(
auto table, StagedTable::Make(identifier, std::move(table_metadata), "", file_io_,
shared_from_this()));
return Transaction::Make(std::move(table), Transaction::Kind::kCreate, false);
}

Result<bool> InMemoryCatalog::TableExists(const TableIdentifier& identifier) const {
Expand Down Expand Up @@ -495,7 +544,7 @@ Result<std::shared_ptr<Table>> InMemoryCatalog::RegisterTable(

std::unique_lock lock(mutex_);
if (!root_namespace_->NamespaceExists(identifier.ns)) {
return NoSuchNamespace("table namespace does not exist.");
return NoSuchNamespace("Table namespace does not exist: {}", identifier.ns);
}
if (!root_namespace_->RegisterTable(identifier, metadata_file_location)) {
return UnknownError("The registry failed.");
Expand Down
1 change: 1 addition & 0 deletions src/iceberg/meson.build
Original file line number Diff line number Diff line change
Expand Up @@ -106,6 +106,7 @@ iceberg_sources = files(
'util/decimal.cc',
'util/gzip_internal.cc',
'util/murmurhash3_internal.cc',
'util/property_util.cc',
'util/snapshot_util.cc',
'util/temporal_util.cc',
'util/timepoint.cc',
Expand Down
32 changes: 32 additions & 0 deletions src/iceberg/table_identifier.h
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,8 @@
/// \file iceberg/table_identifier.h
/// A TableIdentifier is a unique identifier for a table

#include <format>
#include <sstream>
#include <string>
#include <vector>

Expand All @@ -35,6 +37,15 @@ struct ICEBERG_EXPORT Namespace {
std::vector<std::string> levels;

/// \brief Two namespaces compare equal when their `levels` vectors compare equal.
bool operator==(const Namespace& other) const { return levels == other.levels; }

std::string ToString() const {
std::ostringstream oss;
for (size_t i = 0; i < levels.size(); ++i) {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

formatter_internal.h has a FormatRange to implement this. Using it requires adding a table_identifiler.cc since we cannot export internal header here.

if (i) oss << '.';
oss << levels[i];
}
return oss.str();
}
};

/// \brief Identifies a table in iceberg catalog.
Expand All @@ -53,6 +64,27 @@ struct ICEBERG_EXPORT TableIdentifier {
}
return {};
}

/// \brief Render the identifier as "<namespace>.<name>".
///
/// When the namespace has no levels only the table name is returned, so the
/// result never starts with a stray '.' (previously an empty namespace
/// produced ".name").
///
/// \return The printable identifier.
std::string ToString() const {
  if (ns.levels.empty()) {
    return name;
  }
  return ns.ToString() + '.' + name;
}
};

} // namespace iceberg

namespace std {

/// \brief std::format support for iceberg::Namespace, rendered through
/// Namespace::ToString().
///
/// parse() is inherited from std::formatter<std::string>, so the standard
/// string format specifiers (fill, alignment, width, precision) are honored.
/// The previous no-op parse() override accepted only an empty "{}" spec.
//
// NOTE(review): a reviewer commented that these explicit specializations may
// not be needed because such types are automatically supported by including
// iceberg/util/formatter.h -- confirm before keeping them.
template <>
struct formatter<iceberg::Namespace> : std::formatter<std::string> {
  /// Format the namespace text using the spec captured by the inherited parse().
  auto format(const iceberg::Namespace& ns, format_context& ctx) const {
    return std::formatter<std::string>::format(ns.ToString(), ctx);
  }
};

/// \brief std::format support for iceberg::TableIdentifier, rendered through
/// TableIdentifier::ToString().
///
/// parse() is inherited from std::formatter<std::string>, so the standard
/// string format specifiers (fill, alignment, width, precision) are honored.
/// The previous no-op parse() override accepted only an empty "{}" spec.
template <>
struct formatter<iceberg::TableIdentifier> : std::formatter<std::string> {
  /// Format the identifier text using the spec captured by the inherited parse().
  auto format(const iceberg::TableIdentifier& id, format_context& ctx) const {
    return std::formatter<std::string>::format(id.ToString(), ctx);
  }
};
} // namespace std
117 changes: 116 additions & 1 deletion src/iceberg/table_metadata.cc
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
#include "iceberg/table_metadata.h"

#include <algorithm>
#include <atomic>
#include <charconv>
#include <chrono>
#include <cstdint>
Expand All @@ -37,6 +38,7 @@
#include "iceberg/exception.h"
#include "iceberg/file_io.h"
#include "iceberg/json_internal.h"
#include "iceberg/metrics_config.h"
#include "iceberg/partition_field.h"
#include "iceberg/partition_spec.h"
#include "iceberg/result.h"
Expand All @@ -50,12 +52,73 @@
#include "iceberg/util/gzip_internal.h"
#include "iceberg/util/location_util.h"
#include "iceberg/util/macros.h"
#include "iceberg/util/property_util.h"
#include "iceberg/util/type_util.h"
#include "iceberg/util/uuid.h"
namespace iceberg {
namespace {
const TimePointMs kInvalidLastUpdatedMs = TimePointMs::min();
constexpr int32_t kLastAdded = -1;
constexpr std::string_view kMetadataFolderName = "metadata";

// TableMetadata private static methods
/// \brief Rebuild `spec` so that its source ids refer to columns of
/// `fresh_schema`.
///
/// Each partition field's source column is resolved to a name through
/// `base_schema` and then looked up by that name in `fresh_schema`.
///
/// \param spec_id Spec id given to the rebuilt partition spec.
/// \param spec Partition spec to rebuild.
/// \param base_schema Schema whose column ids `spec` currently references.
/// \param fresh_schema Schema whose column ids the result should reference.
/// \param next_id Generator for partition field ids; when empty, each field
///        keeps its existing field id.
/// \return The rebuilt spec, or InvalidSchema when a named source column is
///         missing from `fresh_schema`.
//
// NOTE(review): a reviewer suggested that FreshPartitionSpec and FreshSortOrder
// should also be called in SetDefaultPartitionSpec and SetDefaultSortOrder,
// respectively.
Result<std::shared_ptr<PartitionSpec>> FreshPartitionSpec(
    int32_t spec_id, const PartitionSpec& spec, const Schema& base_schema,
    const Schema& fresh_schema, std::function<int32_t()> next_id) {
  std::vector<PartitionField> rebuilt_fields;
  for (auto& old_field : spec.fields()) {
    ICEBERG_ASSIGN_OR_RAISE(auto column_name,
                            base_schema.FindColumnNameById(old_field.source_id()));

    // In the case of a source field not found, the column has been deleted.
    // This only happens in V1 tables where the reference is still around as a
    // void transform, so the original source id is kept as the default.
    int32_t remapped_source_id = old_field.source_id();
    if (column_name.has_value()) {
      ICEBERG_ASSIGN_OR_RAISE(auto matched,
                              fresh_schema.FindFieldByName(column_name.value()));
      if (!matched.has_value()) [[unlikely]] {
        return InvalidSchema("Partition field {} does not exist in the schema",
                             column_name.value());
      }
      remapped_source_id = matched.value().get().field_id();
    }

    const int32_t partition_field_id = next_id ? next_id() : old_field.field_id();
    rebuilt_fields.emplace_back(remapped_source_id, partition_field_id,
                                std::string(old_field.name()), old_field.transform());
  }
  return PartitionSpec::Make(fresh_schema, spec_id, std::move(rebuilt_fields), false);
}

/// \brief Rebuild `order` with source ids re-resolved through `schema`.
///
/// An unsorted order is returned as SortOrder::Unsorted(). Otherwise each sort
/// field's source id is mapped to a column name via `schema` and that name is
/// looked up again in `schema` to obtain the id stored in the rebuilt field.
///
/// \param order_id Order id given to the rebuilt sort order.
/// \param schema Schema used for both the id->name and the name->field lookup.
/// \param order Sort order to rebuild.
/// \return The rebuilt order, or InvalidSchema when a lookup finds nothing.
//
// NOTE(review): the error messages mention an "old" and a "new" schema, but
// both lookups use the same `schema` argument. If the order still carries ids
// from a pre-reassignment schema, the first lookup presumably needs that base
// schema (as FreshPartitionSpec takes) -- confirm against the caller.
Result<std::shared_ptr<SortOrder>> FreshSortOrder(int32_t order_id, const Schema& schema,
                                                  const SortOrder& order) {
  if (order.is_unsorted()) {
    return SortOrder::Unsorted();
  }

  std::vector<SortField> remapped_fields;
  for (const auto& sort_field : order.fields()) {
    // Step 1: id -> column name.
    ICEBERG_ASSIGN_OR_RAISE(auto column_name,
                            schema.FindColumnNameById(sort_field.source_id()));
    if (!column_name.has_value()) {
      return InvalidSchema("Unable to find source field with ID {} in the old schema",
                           sort_field.source_id());
    }

    // Step 2: column name -> field, whose id becomes the new source id.
    ICEBERG_ASSIGN_OR_RAISE(auto matched, schema.FindFieldByName(column_name.value()));
    if (!matched.has_value()) {
      return InvalidSchema("Unable to find field '{}' in the new schema",
                           column_name.value());
    }

    int32_t remapped_source_id = matched.value().get().field_id();
    remapped_fields.emplace_back(remapped_source_id, sort_field.transform(),
                                 sort_field.direction(), sort_field.null_order());
  }

  return SortOrder::Make(order_id, std::move(remapped_fields));
}
} // namespace

std::string ToString(const SnapshotLogEntry& entry) {
Expand All @@ -68,6 +131,53 @@ std::string ToString(const MetadataLogEntry& entry) {
entry.metadata_file);
}

/// \brief Build the initial TableMetadata for a new table.
///
/// Steps, in order:
///  1. Reject any key of `properties` contained in
///     TableProperties::reserved_properties().
///  2. Reassign the column ids of `schema` through AssignFreshIds, using a
///     counter that yields 1, 2, 3, ...
///  3. Rebuild `spec` against the fresh schema with newly generated partition
///     field ids, and rebuild `sort_order`.
///  4. Verify the metrics configuration against the fresh schema and run the
///     commit-property validation.
///  5. Assemble the result with TableMetadataBuilder::BuildFromEmpty().
///
/// \param schema Schema supplied by the caller; its ids are reassigned.
/// \param spec Partition spec expressed against `schema`.
/// \param sort_order Sort order expressed against `schema`.
/// \param location Table location stored in the metadata.
/// \param properties Table properties; must not contain reserved keys.
/// \param format_version Format version passed to BuildFromEmpty().
/// \return The built metadata, or the first error encountered.
Result<std::unique_ptr<TableMetadata>> TableMetadata::Make(
    const iceberg::Schema& schema, const iceberg::PartitionSpec& spec,
    const iceberg::SortOrder& sort_order, const std::string& location,
    const std::unordered_map<std::string, std::string>& properties, int format_version) {
  for (const auto& [key, _] : properties) {
    if (TableProperties::reserved_properties().contains(key)) {
      return InvalidArgument(
          "Table properties should not contain reserved properties, but got {}", key);
    }
  }

  // Reassign all column ids to ensure consistency.
  // A plain counter is enough: the generator is captured by reference and only
  // used while this function is running, so no atomic is needed (this answers
  // the reviewer's question about why std::atomic was used here).
  int32_t last_column_id = 0;
  auto next_id = [&last_column_id]() -> int32_t { return ++last_column_id; };
  ICEBERG_ASSIGN_OR_RAISE(auto fresh_schema,
                          AssignFreshIds(Schema::kInitialSchemaId, schema, next_id));

  // Rebuild the partition spec using the new column ids.
  // NOTE(review): the first generated partition field id is
  // kInvalidPartitionFieldId + 1; the Java implementation starts partition
  // field ids at 1000 -- confirm this is the intended starting value.
  int32_t last_partition_field_id = PartitionSpec::kInvalidPartitionFieldId;
  auto next_partition_field_id = [&last_partition_field_id]() -> int32_t {
    return ++last_partition_field_id;
  };
  ICEBERG_ASSIGN_OR_RAISE(auto fresh_spec,
                          FreshPartitionSpec(PartitionSpec::kInitialSpecId, spec, schema,
                                             *fresh_schema, next_partition_field_id));

  // Rebuild the sort order using the new column ids. An unsorted order keeps
  // its own order id.
  // NOTE(review): FreshSortOrder receives only the fresh schema, so the
  // original source ids of `sort_order` are resolved against the reassigned
  // ids rather than against `schema` -- confirm this is intended.
  int32_t fresh_order_id =
      sort_order.is_unsorted() ? sort_order.order_id() : SortOrder::kInitialSortOrderId;
  ICEBERG_ASSIGN_OR_RAISE(auto fresh_order,
                          FreshSortOrder(fresh_order_id, *fresh_schema, sort_order));

  // Validate the metrics configuration.
  ICEBERG_RETURN_UNEXPECTED(
      MetricsConfig::VerifyReferencedColumns(properties, *fresh_schema));

  // NOTE(review): the result of this call is discarded. If
  // ValidateCommitProperties reports failures through its return value, it
  // should be propagated like the check above -- confirm its signature.
  PropertyUtil::ValidateCommitProperties(properties);

  return TableMetadataBuilder::BuildFromEmpty(format_version)
      ->SetLocation(location)
      .SetCurrentSchema(std::move(fresh_schema), last_column_id)
      .SetDefaultPartitionSpec(std::move(fresh_spec))
      .SetDefaultSortOrder(std::move(fresh_order))
      .SetProperties(properties)
      .Build();
}

// Returns the current schema by delegating to SchemaById() with
// `current_schema_id`; any error comes from that lookup.
Result<std::shared_ptr<Schema>> TableMetadata::Schema() const {
return SchemaById(current_schema_id);
}
Expand Down Expand Up @@ -408,6 +518,10 @@ class TableMetadataBuilder::Impl {
// Returns the stored `base_` pointer as-is; it may be null (other members
// check `base_ != nullptr` before using it).
const TableMetadata* base() const { return base_; }
// Read-only access to the `metadata_` member that the builder mutates.
const TableMetadata& metadata() const { return metadata_; }

/// \brief Overwrite the location stored in the metadata under construction.
///
/// \param location New table location; copied into an owned string.
//
// NOTE(review): a reviewer suggested following the Java implementation, i.e.
// validating the input location and then appending a corresponding
// TableUpdate to the changes list. Java reference quoted in the review:
//
//   public Builder setLocation(String newLocation) {
//     if (location != null && location.equals(newLocation)) {
//       return this;
//     }
//
//     this.location = newLocation;
//     changes.add(new MetadataUpdate.SetLocation(newLocation));
//
//     return this;
//   }
void SetLocation(std::string_view location) {
  std::string owned_location(location);
  metadata_.location = std::move(owned_location);
}

void SetMetadataLocation(std::string_view metadata_location) {
metadata_location_ = std::string(metadata_location);
if (base_ != nullptr) {
Expand Down Expand Up @@ -917,7 +1031,8 @@ TableMetadataBuilder& TableMetadataBuilder::RemoveProperties(
}

/// \brief Set the table location on the metadata being built.
///
/// Delegates to the builder implementation. The stale "not implemented" throw
/// that preceded the delegation has been removed; it made the delegation
/// unreachable.
///
/// \param location New table location.
/// \return This builder, to allow call chaining.
TableMetadataBuilder& TableMetadataBuilder::SetLocation(std::string_view location) {
  impl_->SetLocation(location);
  return *this;
}

TableMetadataBuilder& TableMetadataBuilder::AddEncryptionKey(
Expand Down
6 changes: 6 additions & 0 deletions src/iceberg/table_metadata.h
Original file line number Diff line number Diff line change
Expand Up @@ -124,6 +124,12 @@ struct ICEBERG_EXPORT TableMetadata {
/// A `long` higher than all assigned row IDs
int64_t next_row_id;

/// \brief Create the initial metadata for a new table.
///
/// \param schema Table schema.
/// \param spec Partition spec expressed against `schema`.
/// \param sort_order Sort order expressed against `schema`.
/// \param location Table location.
/// \param properties Table properties.
/// \param format_version Table format version; defaults to
///        kDefaultTableFormatVersion.
/// \return The new metadata, or an error.
//
// NOTE(review): a reviewer asked whether the accessors below should be renamed
// to GetSchema, GetSnapshot, etc., so that the `iceberg::` prefix is not
// needed here and elsewhere.
static Result<std::unique_ptr<TableMetadata>> Make(
    const iceberg::Schema& schema, const iceberg::PartitionSpec& spec,
    const iceberg::SortOrder& sort_order, const std::string& location,
    const std::unordered_map<std::string, std::string>& properties,
    int format_version = kDefaultTableFormatVersion);

/// \brief Get the current schema, return NotFoundError if not found
Result<std::shared_ptr<iceberg::Schema>> Schema() const;
/// \brief Get the current schema by ID, return NotFoundError if not found
Expand Down
7 changes: 7 additions & 0 deletions src/iceberg/table_properties.cc
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,13 @@ const std::unordered_set<std::string>& TableProperties::reserved_properties() {
return kReservedProperties;
}

const std::unordered_set<std::string>& TableProperties::commit_properties() {
static const std::unordered_set<std::string> kCommitProperties = {
kCommitNumRetries.key(), kCommitMinRetryWaitMs.key(), kCommitMaxRetryWaitMs.key(),
kCommitTotalRetryTimeMs.key()};
return kCommitProperties;
}

TableProperties TableProperties::default_properties() { return {}; }

TableProperties TableProperties::FromMap(
Expand Down
3 changes: 3 additions & 0 deletions src/iceberg/table_properties.h
Original file line number Diff line number Diff line change
Expand Up @@ -286,6 +286,9 @@ class ICEBERG_EXPORT TableProperties : public ConfigBase<TableProperties> {
/// \return The set of reserved property keys
static const std::unordered_set<std::string>& reserved_properties();

/// \brief Get the set of commit table property keys.
///
/// \return The set of commit property keys
static const std::unordered_set<std::string>& commit_properties();

/// \brief Create a default TableProperties instance.
///
/// \return A unique pointer to a TableProperties instance with default values
Expand Down
Loading
Loading