-
Notifications
You must be signed in to change notification settings - Fork 76
WIP feat: InMemoryCatalog create table #416
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -22,6 +22,8 @@ | |
| /// \file iceberg/table_identifier.h | ||
| /// A TableIdentifier is a unique identifier for a table | ||
|
|
||
| #include <format> | ||
| #include <sstream> | ||
| #include <string> | ||
| #include <vector> | ||
|
|
||
|
|
@@ -35,6 +37,15 @@ struct ICEBERG_EXPORT Namespace { | |
| std::vector<std::string> levels; | ||
|
|
||
| /// \brief Two namespaces compare equal when their level sequences are equal. | ||
| bool operator==(const Namespace& other) const { | ||
|   return this->levels == other.levels; | ||
| } | ||
|
|
||
| /// \brief Join the namespace levels into a single dot-separated string. | ||
| /// \return The levels separated by '.', or an empty string when there are no levels. | ||
| std::string ToString() const { | ||
|   std::string joined; | ||
|   bool first = true; | ||
|   for (const auto& level : levels) { | ||
|     // Emit the separator before every level except the first one. | ||
|     if (!first) joined += '.'; | ||
|     joined += level; | ||
|     first = false; | ||
|   } | ||
|   return joined; | ||
| } | ||
| }; | ||
|
|
||
| /// \brief Identifies a table in iceberg catalog. | ||
|
|
@@ -53,6 +64,27 @@ struct ICEBERG_EXPORT TableIdentifier { | |
| } | ||
| return {}; | ||
| } | ||
|
|
||
| /// \brief Render the identifier as "<namespace>.<name>". | ||
| /// \return The dot-joined namespace, a '.', and the table name; only the table name | ||
| ///         when the namespace has no levels. | ||
| std::string ToString() const { | ||
|   // An empty namespace must not contribute a leading '.' separator. | ||
|   if (ns.levels.empty()) return name; | ||
|   return ns.ToString() + '.' + name; | ||
| } | ||
| }; | ||
|
|
||
| } // namespace iceberg | ||
|
|
||
| namespace std { | ||
|
|
||
| template <> | ||
| struct formatter<iceberg::Namespace> : std::formatter<std::string> { | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. You don't need to explicitly add these as they are automatically supported by including |
||
| constexpr auto parse(format_parse_context& ctx) { return ctx.begin(); } | ||
| auto format(const iceberg::Namespace& ns, format_context& ctx) const { | ||
| return std::formatter<std::string>::format(ns.ToString(), ctx); | ||
| } | ||
| }; | ||
|
|
||
| /// \brief std::format support for iceberg::TableIdentifier. | ||
| /// | ||
| /// The format specification is parsed by the inherited std::formatter<std::string>, | ||
| /// so string options such as width and alignment apply to the ToString() text. | ||
| template <> | ||
| struct formatter<iceberg::TableIdentifier> : std::formatter<std::string> { | ||
|   /// \brief Write id.ToString() to the output using the base string formatter. | ||
|   auto format(const iceberg::TableIdentifier& id, format_context& ctx) const { | ||
|     return std::formatter<std::string>::format(id.ToString(), ctx); | ||
|   } | ||
| }; | ||
| } // namespace std | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -20,6 +20,7 @@ | |
| #include "iceberg/table_metadata.h" | ||
|
|
||
| #include <algorithm> | ||
| #include <atomic> | ||
| #include <charconv> | ||
| #include <chrono> | ||
| #include <cstdint> | ||
|
|
@@ -37,6 +38,7 @@ | |
| #include "iceberg/exception.h" | ||
| #include "iceberg/file_io.h" | ||
| #include "iceberg/json_internal.h" | ||
| #include "iceberg/metrics_config.h" | ||
| #include "iceberg/partition_field.h" | ||
| #include "iceberg/partition_spec.h" | ||
| #include "iceberg/result.h" | ||
|
|
@@ -50,12 +52,73 @@ | |
| #include "iceberg/util/gzip_internal.h" | ||
| #include "iceberg/util/location_util.h" | ||
| #include "iceberg/util/macros.h" | ||
| #include "iceberg/util/property_util.h" | ||
| #include "iceberg/util/type_util.h" | ||
| #include "iceberg/util/uuid.h" | ||
| namespace iceberg { | ||
| namespace { | ||
| const TimePointMs kInvalidLastUpdatedMs = TimePointMs::min(); | ||
| constexpr int32_t kLastAdded = -1; | ||
| constexpr std::string_view kMetadataFolderName = "metadata"; | ||
|
|
||
| // TableMetadata private static methods | ||
| Result<std::shared_ptr<PartitionSpec>> FreshPartitionSpec( | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I think |
||
| int32_t spec_id, const PartitionSpec& spec, const Schema& base_schema, | ||
| const Schema& fresh_schema, std::function<int32_t()> next_id) { | ||
| std::vector<PartitionField> partition_fields; | ||
| for (auto& field : spec.fields()) { | ||
| ICEBERG_ASSIGN_OR_RAISE(auto source_name, | ||
| base_schema.FindColumnNameById(field.source_id())); | ||
| int32_t source_id; | ||
| if (!source_name.has_value()) { | ||
| // In the case of a source field not found, the column has been deleted. | ||
| // This only happens in V1 tables where the reference is still around as a void | ||
| // transform | ||
| source_id = field.source_id(); | ||
| } else { | ||
| ICEBERG_ASSIGN_OR_RAISE(auto fresh_field, | ||
| fresh_schema.FindFieldByName(source_name.value())); | ||
| if (!fresh_field.has_value()) [[unlikely]] { | ||
| return InvalidSchema("Partition field {} does not exist in the schema", | ||
| source_name.value()); | ||
| } | ||
| source_id = fresh_field.value().get().field_id(); | ||
| } | ||
| partition_fields.emplace_back(source_id, next_id ? next_id() : field.field_id(), | ||
| std::string(field.name()), field.transform()); | ||
| } | ||
| return PartitionSpec::Make(fresh_schema, spec_id, std::move(partition_fields), false); | ||
| } | ||
|
|
||
| /// \brief Rebuild a sort order so its fields carry field IDs looked up in `schema`. | ||
| /// | ||
| /// \param order_id ID passed to SortOrder::Make for the rebuilt order. | ||
| /// \param schema Schema used to resolve each source ID to a column name and then to | ||
| ///        look that name up again for the ID stored in the rebuilt field. | ||
| /// \param order Sort order whose fields are rebuilt. | ||
| /// \return SortOrder::Unsorted() when `order` is unsorted, otherwise the result of | ||
| ///         SortOrder::Make; InvalidSchema when either lookup finds nothing. | ||
| /// | ||
| /// NOTE(review): both lookups use the same `schema`, although the error messages | ||
| /// mention an "old" and a "new" schema -- confirm whether the name lookup should use | ||
| /// the original schema instead. | ||
| Result<std::shared_ptr<SortOrder>> FreshSortOrder(int32_t order_id, const Schema& schema, | ||
|                                                   const SortOrder& order) { | ||
|   if (order.is_unsorted()) { | ||
|     return SortOrder::Unsorted(); | ||
|   } | ||
|   std::vector<SortField> remapped; | ||
|   for (const auto& sort_field : order.fields()) { | ||
|     // Resolve the referenced source ID to a column name. | ||
|     ICEBERG_ASSIGN_OR_RAISE(auto column_name, | ||
|                             schema.FindColumnNameById(sort_field.source_id())); | ||
|     if (!column_name.has_value()) { | ||
|       return InvalidSchema("Unable to find source field with ID {} in the old schema", | ||
|                            sort_field.source_id()); | ||
|     } | ||
|     // Look the name up again to obtain the ID stored in the rebuilt field. | ||
|     ICEBERG_ASSIGN_OR_RAISE(auto matched, schema.FindFieldByName(column_name.value())); | ||
|     if (!matched.has_value()) { | ||
|       return InvalidSchema("Unable to find field '{}' in the new schema", | ||
|                            column_name.value()); | ||
|     } | ||
|     const int32_t remapped_id = matched.value().get().field_id(); | ||
|     remapped.emplace_back(remapped_id, sort_field.transform(), sort_field.direction(), | ||
|                           sort_field.null_order()); | ||
|   } | ||
|   return SortOrder::Make(order_id, std::move(remapped)); | ||
| } | ||
| } // namespace | ||
|
|
||
| std::string ToString(const SnapshotLogEntry& entry) { | ||
|
|
@@ -68,6 +131,53 @@ std::string ToString(const MetadataLogEntry& entry) { | |
| entry.metadata_file); | ||
| } | ||
|
|
||
| Result<std::unique_ptr<TableMetadata>> TableMetadata::Make( | ||
| const iceberg::Schema& schema, const iceberg::PartitionSpec& spec, | ||
| const iceberg::SortOrder& sort_order, const std::string& location, | ||
| const std::unordered_map<std::string, std::string>& properties, int format_version) { | ||
| for (const auto& [key, _] : properties) { | ||
| if (TableProperties::reserved_properties().contains(key)) { | ||
| return InvalidArgument( | ||
| "Table properties should not contain reserved properties, but got {}", key); | ||
| } | ||
| } | ||
|
|
||
| // Reassign all column ids to ensure consistency | ||
| std::atomic<int32_t> last_column_id = 0; | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Why do we need to use atomic here? |
||
| auto next_id = [&last_column_id]() -> int32_t { return ++last_column_id; }; | ||
| ICEBERG_ASSIGN_OR_RAISE(auto fresh_schema, | ||
| AssignFreshIds(Schema::kInitialSchemaId, schema, next_id)); | ||
|
|
||
| // Rebuild the partition spec using the new column ids | ||
| std::atomic<int32_t> last_partition_field_id = PartitionSpec::kInvalidPartitionFieldId; | ||
| auto next_partition_field_id = [&last_partition_field_id]() -> int32_t { | ||
| return ++last_partition_field_id; | ||
| }; | ||
| ICEBERG_ASSIGN_OR_RAISE(auto fresh_spec, | ||
| FreshPartitionSpec(PartitionSpec::kInitialSpecId, spec, schema, | ||
| *fresh_schema, next_partition_field_id)); | ||
|
|
||
| // rebuild the sort order using the new column ids | ||
| int32_t fresh_order_id = | ||
| sort_order.is_unsorted() ? sort_order.order_id() : SortOrder::kInitialSortOrderId; | ||
| ICEBERG_ASSIGN_OR_RAISE(auto fresh_order, | ||
| FreshSortOrder(fresh_order_id, *fresh_schema, sort_order)) | ||
|
|
||
| // Validate the metrics configuration. | ||
| ICEBERG_RETURN_UNEXPECTED( | ||
| MetricsConfig::VerifyReferencedColumns(properties, *fresh_schema)); | ||
|
|
||
| PropertyUtil::ValidateCommitProperties(properties); | ||
|
|
||
| return TableMetadataBuilder::BuildFromEmpty(format_version) | ||
| ->SetLocation(location) | ||
| .SetCurrentSchema(std::move(fresh_schema), last_column_id.load()) | ||
| .SetDefaultPartitionSpec(std::move(fresh_spec)) | ||
| .SetDefaultSortOrder(std::move(fresh_order)) | ||
| .SetProperties(properties) | ||
| .Build(); | ||
| } | ||
|
|
||
| /// \brief Return the result of SchemaById() for `current_schema_id`. | ||
| Result<std::shared_ptr<Schema>> TableMetadata::Schema() const { | ||
|   return this->SchemaById(this->current_schema_id); | ||
| } | ||
|
|
@@ -408,6 +518,10 @@ class TableMetadataBuilder::Impl { | |
| const TableMetadata* base() const { return base_; } | ||
| const TableMetadata& metadata() const { return metadata_; } | ||
|
|
||
| void SetLocation(std::string_view location) { | ||
| metadata_.location = std::string(location); | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. We might need to follow the Java impl to validate the input location and then produce a TableUpdate to the changes list.
```java
public Builder setLocation(String newLocation) {
  if (location != null && location.equals(newLocation)) {
    return this;
  }
  this.location = newLocation;
  changes.add(new MetadataUpdate.SetLocation(newLocation));
  return this;
}
```
|
||
| } | ||
|
|
||
| void SetMetadataLocation(std::string_view metadata_location) { | ||
| metadata_location_ = std::string(metadata_location); | ||
| if (base_ != nullptr) { | ||
|
|
@@ -917,7 +1031,8 @@ TableMetadataBuilder& TableMetadataBuilder::RemoveProperties( | |
| } | ||
|
|
||
| /// \brief Set the table location on the metadata being built. | ||
| /// \param location The table location to store. | ||
| /// \return This builder, so calls can be chained. | ||
| TableMetadataBuilder& TableMetadataBuilder::SetLocation(std::string_view location) { | ||
|   // Forward to the implementation, which stores the location on its metadata. | ||
|   impl_->SetLocation(location); | ||
|   return *this; | ||
| } | ||
|
|
||
| TableMetadataBuilder& TableMetadataBuilder::AddEncryptionKey( | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -124,6 +124,12 @@ struct ICEBERG_EXPORT TableMetadata { | |
| /// A `long` higher than all assigned row IDs | ||
| int64_t next_row_id; | ||
|
|
||
| static Result<std::unique_ptr<TableMetadata>> Make( | ||
| const iceberg::Schema& schema, const iceberg::PartitionSpec& spec, | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Do you think we should rename functions below to |
||
| const iceberg::SortOrder& sort_order, const std::string& location, | ||
| const std::unordered_map<std::string, std::string>& properties, | ||
| int format_version = kDefaultTableFormatVersion); | ||
|
|
||
| /// \brief Get the current schema, return NotFoundError if not found | ||
| Result<std::shared_ptr<iceberg::Schema>> Schema() const; | ||
| /// \brief Get the current schema by ID, return NotFoundError if not found | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
`formatter_internal.h` has a `FormatRange` to implement this. Using it requires adding a `table_identifier.cc`, since we cannot export an internal header here.