From 59a68710e17ab1a46a09d420c6874fa139ec67bb Mon Sep 17 00:00:00 2001
From: Brian Michell
Date: Thu, 4 Dec 2025 14:46:38 -0600
Subject: [PATCH 01/14] Add open_as_void option to zarr v2 driver (#6)

---
 tensorstore/driver/zarr/driver.cc      | 211 ++++++++++++++--
 tensorstore/driver/zarr/driver_impl.h  |   8 +-
 tensorstore/driver/zarr/driver_test.cc | 322 +++++++++++++++++++++++++
 tensorstore/driver/zarr/schema.yml     |   8 +
 tensorstore/driver/zarr/spec.cc        |  22 +-
 tensorstore/driver/zarr/spec.h         |  13 +-
 6 files changed, 561 insertions(+), 23 deletions(-)

diff --git a/tensorstore/driver/zarr/driver.cc b/tensorstore/driver/zarr/driver.cc
index 69164648e..8a0943ae5 100644
--- a/tensorstore/driver/zarr/driver.cc
+++ b/tensorstore/driver/zarr/driver.cc
@@ -29,6 +29,10 @@
 #include "absl/status/status.h"
 #include "absl/strings/cord.h"
 #include <nlohmann/json.hpp>
+#include "riegeli/bytes/cord_reader.h"
+#include "riegeli/bytes/cord_writer.h"
+#include "riegeli/bytes/read_all.h"
+#include "riegeli/bytes/write.h"
 #include "tensorstore/array.h"
 #include "tensorstore/array_storage_statistics.h"
 #include "tensorstore/box.h"
@@ -137,6 +141,20 @@ absl::Status ZarrDriverSpec::ApplyOptions(SpecOptions&& options) {
 }
 
 Result<SpecRankAndFieldInfo> ZarrDriverSpec::GetSpecInfo() const {
+  // For open_as_void, we don't use normal field resolution.
+  // Note: When opening an existing array, dtype may not be known yet,
+  // so we can't determine the exact rank until metadata is loaded.
+  if (open_as_void && partial_metadata.dtype) {
+    SpecRankAndFieldInfo info;
+    info.full_rank = schema.rank();
+    info.chunked_rank = partial_metadata.rank;
+    // For void access, add one dimension for the bytes
+    info.field_rank = 1;  // The bytes dimension
+    if (info.chunked_rank != dynamic_rank) {
+      info.full_rank = info.chunked_rank + 1;
+    }
+    return info;
+  }
   return GetSpecRankAndFieldInfo(partial_metadata, selected_field, schema);
 }
 
@@ -171,6 +189,10 @@ TENSORSTORE_DEFINE_JSON_DEFAULT_BINDER(
       jb::Member("field", jb::Projection<&ZarrDriverSpec::selected_field>(
                               jb::DefaultValue(
                                   [](auto* obj) { *obj = std::string{}; }))),
+      jb::Member("open_as_void",
+                 jb::Projection<&ZarrDriverSpec::open_as_void>(
+                     jb::DefaultValue([](auto* v) { *v = false; }))),
       jb::Initialize([](auto* obj) {
         TENSORSTORE_ASSIGN_OR_RETURN(auto info, obj->GetSpecInfo());
         if (info.full_rank != dynamic_rank) {
           TENSORSTORE_RETURN_IF_ERROR(
@@ -210,8 +232,19 @@ Result<SharedArray<const void>> ZarrDriverSpec::GetFillValue(
   const auto& metadata = partial_metadata;
   if (metadata.dtype && metadata.fill_value) {
     TENSORSTORE_ASSIGN_OR_RETURN(
-        size_t field_index, GetFieldIndex(*metadata.dtype, selected_field));
-    fill_value = (*metadata.fill_value)[field_index];
+        size_t field_index,
+        GetFieldIndex(*metadata.dtype, selected_field, open_as_void));
+
+    // For void access, synthesize a byte-level fill value
+    if (field_index == kVoidFieldIndex) {
+      const Index nbytes = metadata.dtype->bytes_per_outer_element;
+      auto byte_arr = AllocateArray(
+          span<const Index>({nbytes}), c_order, value_init,
+          dtype_v<::tensorstore::dtypes::byte_t>);
+      fill_value = byte_arr;
+    } else {
+      fill_value = (*metadata.fill_value)[field_index];
+    }
   }
 
   if (!fill_value.valid() || !transform.valid()) {
@@ -238,13 +271,15 @@ Result<SharedArray<const void>> ZarrDriverSpec::GetFillValue(
 
 DataCache::DataCache(Initializer&& initializer, std::string key_prefix,
                      DimensionSeparator dimension_separator,
-                     std::string metadata_key)
+                     std::string metadata_key, bool open_as_void)
     : Base(std::move(initializer),
            GetChunkGridSpecification(
-               *static_cast<ZarrMetadata*>(initializer.metadata.get()))),
+               *static_cast<ZarrMetadata*>(initializer.metadata.get()),
+               open_as_void)),
       key_prefix_(std::move(key_prefix)),
dimension_separator_(dimension_separator), - metadata_key_(std::move(metadata_key)) {} + metadata_key_(std::move(metadata_key)), + open_as_void_(open_as_void) {} absl::Status DataCache::ValidateMetadataCompatibility( const void* existing_metadata_ptr, const void* new_metadata_ptr) { @@ -268,12 +303,40 @@ void DataCache::GetChunkGridBounds(const void* metadata_ptr, DimensionSet& implicit_lower_bounds, DimensionSet& implicit_upper_bounds) { const auto& metadata = *static_cast(metadata_ptr); - assert(bounds.rank() == static_cast(metadata.shape.size())); - std::fill(bounds.origin().begin(), bounds.origin().end(), Index(0)); + // Use >= assertion like zarr3 to allow for extra dimensions + assert(bounds.rank() >= static_cast(metadata.shape.size())); + std::fill(bounds.origin().begin(), + bounds.origin().begin() + metadata.shape.size(), Index(0)); std::copy(metadata.shape.begin(), metadata.shape.end(), bounds.shape().begin()); implicit_lower_bounds = false; - implicit_upper_bounds = true; + implicit_upper_bounds = false; + for (DimensionIndex i = 0; + i < static_cast(metadata.shape.size()); ++i) { + implicit_upper_bounds[i] = true; + } + // Handle extra dimensions for void access or field shapes + if (bounds.rank() > static_cast(metadata.shape.size())) { + if (open_as_void_) { + // For void access, the extra dimension is the bytes_per_outer_element + if (static_cast(metadata.shape.size() + 1) == + bounds.rank()) { + bounds.shape()[metadata.rank] = metadata.dtype.bytes_per_outer_element; + bounds.origin()[metadata.rank] = 0; + } + } else if (metadata.dtype.fields.size() == 1) { + // Handle single field with field_shape (like zarr3) + const auto& field = metadata.dtype.fields[0]; + if (static_cast(metadata.shape.size() + + field.field_shape.size()) == + bounds.rank()) { + for (size_t i = 0; i < field.field_shape.size(); ++i) { + bounds.shape()[metadata.shape.size() + i] = field.field_shape[i]; + bounds.origin()[metadata.shape.size() + i] = 0; + } + } + } + } } Result> DataCache::GetResizedMetadata( @@ -294,13 +357,61 @@ Result> DataCache::GetResizedMetadata( } internal::ChunkGridSpecification DataCache::GetChunkGridSpecification( - const ZarrMetadata& metadata) { + const ZarrMetadata& metadata, bool open_as_void) { internal::ChunkGridSpecification::ComponentList components; - components.reserve(metadata.dtype.fields.size()); std::vector chunked_to_cell_dimensions( metadata.chunks.size()); std::iota(chunked_to_cell_dimensions.begin(), chunked_to_cell_dimensions.end(), static_cast(0)); + + // Special case: void access - create single component for raw bytes + if (open_as_void) { + const Index bytes_per_element = metadata.dtype.bytes_per_outer_element; + + // Create a zero-filled byte array as the fill value + auto base_fill_value = AllocateArray( + span({bytes_per_element}), c_order, value_init, + dtype_v); + + // The full chunk shape includes the extra bytes dimension + std::vector chunk_shape_with_bytes = metadata.chunks; + chunk_shape_with_bytes.push_back(bytes_per_element); + + const DimensionIndex cell_rank = metadata.rank + 1; + + // Broadcast fill value to target shape [unbounded, ..., bytes_per_element] + // like zarr3 does + std::vector target_shape(metadata.rank, kInfIndex); + target_shape.push_back(bytes_per_element); + auto chunk_fill_value = + BroadcastArray(base_fill_value, BoxView<>(target_shape)).value(); + + // Create valid data bounds - unbounded for chunked dimensions, + // explicit for bytes dimension + Box<> valid_data_bounds(cell_rank); + for (DimensionIndex i = 0; i < 
metadata.rank; ++i) { + valid_data_bounds[i] = IndexInterval::Infinite(); + } + valid_data_bounds[metadata.rank] = + IndexInterval::UncheckedSized(0, bytes_per_element); + + // Create permutation: copy existing order and add the bytes dimension + DimensionIndex layout_order_buffer[kMaxRank]; + GetChunkInnerOrder(metadata.rank, metadata.order, + span(layout_order_buffer, metadata.rank)); + layout_order_buffer[metadata.rank] = metadata.rank; // Add bytes dimension + + components.emplace_back( + internal::AsyncWriteArray::Spec{ + std::move(chunk_fill_value), std::move(valid_data_bounds), + ContiguousLayoutPermutation<>(span(layout_order_buffer, cell_rank))}, + std::move(chunk_shape_with_bytes), chunked_to_cell_dimensions); + + return internal::ChunkGridSpecification{std::move(components)}; + } + + // Normal field-based access + components.reserve(metadata.dtype.fields.size()); for (size_t field_i = 0; field_i < metadata.dtype.fields.size(); ++field_i) { const auto& field = metadata.dtype.fields[field_i]; const auto& field_layout = metadata.chunk_layout.fields[field_i]; @@ -335,12 +446,70 @@ internal::ChunkGridSpecification DataCache::GetChunkGridSpecification( Result, 1>> DataCache::DecodeChunk( span chunk_indices, absl::Cord data) { + if (open_as_void_) { + // For void access, return raw bytes as a single component + const auto& md = metadata(); + + // Decompress the data first (if compressed) + absl::Cord decompressed = std::move(data); + if (md.compressor) { + riegeli::CordReader base_reader(std::move(decompressed)); + auto compressed_reader = md.compressor->GetReader( + base_reader, md.dtype.bytes_per_outer_element); + absl::Cord uncompressed; + TENSORSTORE_RETURN_IF_ERROR( + riegeli::ReadAll(std::move(compressed_reader), uncompressed)); + if (!base_reader.VerifyEndAndClose()) return base_reader.status(); + decompressed = std::move(uncompressed); + } + + // Build the shape: chunk_shape + bytes_per_element + std::vector shape = md.chunks; + shape.push_back(md.dtype.bytes_per_outer_element); + + // Create a byte array from the decompressed data + auto flat_data = decompressed.Flatten(); + auto byte_array = AllocateArray(shape, c_order, default_init, + dtype_v); + std::memcpy(byte_array.data(), flat_data.data(), + std::min(static_cast(byte_array.num_elements()), + flat_data.size())); + + absl::InlinedVector, 1> result; + result.push_back(std::move(byte_array)); + return result; + } return internal_zarr::DecodeChunk(metadata(), std::move(data)); } Result DataCache::EncodeChunk( span chunk_indices, span> component_arrays) { + if (open_as_void_) { + // For void access, encode raw bytes directly + const auto& md = metadata(); + if (component_arrays.size() != 1) { + return absl::InvalidArgumentError( + "Expected exactly one component array for void access"); + } + const auto& byte_array = component_arrays[0]; + absl::Cord uncompressed( + std::string_view(static_cast(byte_array.data()), + byte_array.num_elements())); + + // Compress if needed + if (md.compressor) { + absl::Cord encoded; + riegeli::CordWriter base_writer(&encoded); + auto writer = md.compressor->GetWriter( + base_writer, md.dtype.bytes_per_outer_element); + TENSORSTORE_RETURN_IF_ERROR( + riegeli::Write(std::move(uncompressed), std::move(writer))); + if (!base_writer.Close()) return base_writer.status(); + return encoded; + } + return uncompressed; + } return internal_zarr::EncodeChunk(metadata(), component_arrays); } @@ -356,6 +525,7 @@ absl::Status DataCache::GetBoundSpecData( const auto& metadata = *static_cast(metadata_ptr); 
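  // Editorial note: the members copied below determine the JSON produced by
  // spec round-trips, e.g. (hypothetical) {"driver": "zarr",
  // "open_as_void": true, ...}; recording open_as_void_ here is what lets a
  // re-opened spec reproduce the raw-byte view.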
spec.selected_field = EncodeSelectedField(component_index, metadata.dtype); spec.metadata_key = metadata_key_; + spec.open_as_void = open_as_void_; auto& pm = spec.partial_metadata; pm.rank = metadata.rank; pm.zarr_format = metadata.zarr_format; @@ -416,6 +586,10 @@ Result ZarrDriverSpec::ToUrl() const { return absl::InvalidArgumentError( "zarr2 URL syntax not supported with selected_field specified"); } + if (open_as_void) { + return absl::InvalidArgumentError( + "zarr2 URL syntax not supported with open_as_void specified"); + } TENSORSTORE_ASSIGN_OR_RETURN(auto base_url, store.ToUrl()); return tensorstore::StrCat(base_url, "|", kUrlScheme, ":"); } @@ -483,7 +657,8 @@ class ZarrDriver::OpenState : public ZarrDriver::OpenStateBase { TENSORSTORE_ASSIGN_OR_RETURN( auto metadata, internal_zarr::GetNewMetadata(spec().partial_metadata, - spec().selected_field, spec().schema), + spec().selected_field, spec().schema, + spec().open_as_void), tensorstore::MaybeAnnotateStatus( _, "Cannot create using specified \"metadata\" and schema")); return metadata; @@ -496,7 +671,8 @@ class ZarrDriver::OpenState : public ZarrDriver::OpenStateBase { internal::EncodeCacheKey( &result, spec.store.path, GetDimensionSeparator(spec.partial_metadata, zarr_metadata), - zarr_metadata, spec.metadata_key); + zarr_metadata, spec.metadata_key, + spec.open_as_void ? "void" : "normal"); return result; } @@ -507,7 +683,7 @@ class ZarrDriver::OpenState : public ZarrDriver::OpenStateBase { return std::make_unique( std::move(initializer), spec().store.path, GetDimensionSeparator(spec().partial_metadata, metadata), - spec().metadata_key); + spec().metadata_key, spec().open_as_void); } Result GetComponentIndex(const void* metadata_ptr, @@ -516,7 +692,14 @@ class ZarrDriver::OpenState : public ZarrDriver::OpenStateBase { TENSORSTORE_RETURN_IF_ERROR( ValidateMetadata(metadata, spec().partial_metadata)); TENSORSTORE_ASSIGN_OR_RETURN( - auto field_index, GetFieldIndex(metadata.dtype, spec().selected_field)); + auto field_index, + GetFieldIndex(metadata.dtype, spec().selected_field, + spec().open_as_void)); + // For void access, map to component index 0 since we create a special + // component for raw byte access + if (field_index == kVoidFieldIndex) { + field_index = 0; + } TENSORSTORE_RETURN_IF_ERROR( ValidateMetadataSchema(metadata, field_index, spec().schema)); return field_index; diff --git a/tensorstore/driver/zarr/driver_impl.h b/tensorstore/driver/zarr/driver_impl.h index df3c3930f..c2933dd90 100644 --- a/tensorstore/driver/zarr/driver_impl.h +++ b/tensorstore/driver/zarr/driver_impl.h @@ -63,10 +63,11 @@ class ZarrDriverSpec ZarrPartialMetadata partial_metadata; SelectedField selected_field; std::string metadata_key; + bool open_as_void = false; constexpr static auto ApplyMembers = [](auto& x, auto f) { return f(internal::BaseCast(x), x.partial_metadata, - x.selected_field, x.metadata_key); + x.selected_field, x.metadata_key, x.open_as_void); }; absl::Status ApplyOptions(SpecOptions&& options) override; @@ -98,7 +99,7 @@ class DataCache : public internal_kvs_backed_chunk_driver::DataCache { public: explicit DataCache(Initializer&& initializer, std::string key_prefix, DimensionSeparator dimension_separator, - std::string metadata_key); + std::string metadata_key, bool open_as_void = false); const ZarrMetadata& metadata() { return *static_cast(initial_metadata().get()); @@ -117,7 +118,7 @@ class DataCache : public internal_kvs_backed_chunk_driver::DataCache { /// Returns the ChunkCache grid to use for the given metadata. 
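  /// For field-based access, each component's chunk shape is the zarr chunk
  /// shape with the field's inner dimensions appended; e.g. (illustrative)
  /// chunks [100, 100] and a field of shape [2] yield [100, 100, 2].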
static internal::ChunkGridSpecification GetChunkGridSpecification( - const ZarrMetadata& metadata); + const ZarrMetadata& metadata, bool open_as_void = false); Result, 1>> DecodeChunk( span chunk_indices, absl::Cord data) override; @@ -140,6 +141,7 @@ class DataCache : public internal_kvs_backed_chunk_driver::DataCache { std::string key_prefix_; DimensionSeparator dimension_separator_; std::string metadata_key_; + bool open_as_void_; }; class ZarrDriver; diff --git a/tensorstore/driver/zarr/driver_test.cc b/tensorstore/driver/zarr/driver_test.cc index 92c5be48a..a5014987d 100644 --- a/tensorstore/driver/zarr/driver_test.cc +++ b/tensorstore/driver/zarr/driver_test.cc @@ -3499,4 +3499,326 @@ TEST(DriverTest, UrlSchemeRoundtrip) { {"kvstore", {{"driver", "memory"}, {"path", "abc.zarr/def/"}}}}); } +// Tests for open_as_void functionality + +TEST(ZarrDriverTest, OpenAsVoidSimpleType) { + // Test open_as_void with a simple data type (int16) + auto context = Context::Default(); + + // First create a normal array + ::nlohmann::json create_spec{ + {"driver", "zarr"}, + {"kvstore", {{"driver", "memory"}, {"path", "prefix/"}}}, + {"metadata", + { + {"compressor", nullptr}, + {"dtype", "({{1, 2}, {3, 4}}); + TENSORSTORE_EXPECT_OK( + tensorstore::Write(data, store | tensorstore::Dims(0, 1).SizedInterval( + {0, 0}, {2, 2})) + .result()); + + // Now open with open_as_void=true + ::nlohmann::json void_spec{ + {"driver", "zarr"}, + {"kvstore", {{"driver", "memory"}, {"path", "prefix/"}}}, + {"open_as_void", true}, + }; + + TENSORSTORE_ASSERT_OK_AND_ASSIGN( + auto void_store, + tensorstore::Open(void_spec, context, tensorstore::OpenMode::open, + tensorstore::ReadWriteMode::read) + .result()); + + // The void store should have rank = original_rank + 1 (for bytes dimension) + EXPECT_EQ(3, void_store.rank()); + + // The last dimension should be the size of the data type (2 bytes for int16) + EXPECT_EQ(2, void_store.domain().shape()[2]); + + // The data type should be byte + EXPECT_EQ(tensorstore::dtype_v, + void_store.dtype()); +} + +TEST(ZarrDriverTest, OpenAsVoidStructuredType) { + // Test open_as_void with a structured data type + auto context = Context::Default(); + + // Create an array with a structured dtype + ::nlohmann::json create_spec{ + {"driver", "zarr"}, + {"kvstore", {{"driver", "memory"}, {"path", "prefix/"}}}, + {"field", "y"}, + {"metadata", + { + {"compressor", nullptr}, + {"dtype", ::nlohmann::json::array_t{{"x", "|u1"}, {"y", "({{100, 200}, {300, 400}}); + TENSORSTORE_EXPECT_OK( + tensorstore::Write(data, store | tensorstore::Dims(0, 1).SizedInterval( + {0, 0}, {2, 2})) + .result()); + + // Now open with open_as_void=true - this should give raw access to the entire + // struct + ::nlohmann::json void_spec{ + {"driver", "zarr"}, + {"kvstore", {{"driver", "memory"}, {"path", "prefix/"}}}, + {"open_as_void", true}, + }; + + TENSORSTORE_ASSERT_OK_AND_ASSIGN( + auto void_store, + tensorstore::Open(void_spec, context, tensorstore::OpenMode::open, + tensorstore::ReadWriteMode::read) + .result()); + + // The void store should have rank = original_rank + 1 (for bytes dimension) + EXPECT_EQ(3, void_store.rank()); + + // The last dimension should be 3 bytes (1 byte for u1 + 2 bytes for i2) + EXPECT_EQ(3, void_store.domain().shape()[2]); + + // The data type should be byte + EXPECT_EQ(tensorstore::dtype_v, + void_store.dtype()); +} + +TEST(ZarrDriverTest, OpenAsVoidWithCompression) { + // Test open_as_void with compression enabled + auto context = Context::Default(); + + // Create an array with blosc 
compression + ::nlohmann::json create_spec{ + {"driver", "zarr"}, + {"kvstore", {{"driver", "memory"}, {"path", "prefix/"}}}, + {"metadata", + { + {"compressor", {{"id", "blosc"}}}, + {"dtype", "({{0x01020304, 0x05060708}, + {0x090a0b0c, 0x0d0e0f10}}); + TENSORSTORE_EXPECT_OK( + tensorstore::Write(data, store | tensorstore::Dims(0, 1).SizedInterval( + {0, 0}, {2, 2})) + .result()); + + // Now open with open_as_void=true + ::nlohmann::json void_spec{ + {"driver", "zarr"}, + {"kvstore", {{"driver", "memory"}, {"path", "prefix/"}}}, + {"open_as_void", true}, + }; + + TENSORSTORE_ASSERT_OK_AND_ASSIGN( + auto void_store, + tensorstore::Open(void_spec, context, tensorstore::OpenMode::open, + tensorstore::ReadWriteMode::read) + .result()); + + // The void store should have rank = original_rank + 1 (for bytes dimension) + EXPECT_EQ(3, void_store.rank()); + + // The last dimension should be 4 bytes for int32 + EXPECT_EQ(4, void_store.domain().shape()[2]); + + // The data type should be byte + EXPECT_EQ(tensorstore::dtype_v, + void_store.dtype()); + + // Read the raw bytes and verify decompression works + TENSORSTORE_ASSERT_OK_AND_ASSIGN( + auto read_result, + tensorstore::Read(void_store | tensorstore::Dims(0, 1).SizedInterval( + {0, 0}, {2, 2})) + .result()); + EXPECT_EQ(read_result.shape()[0], 2); + EXPECT_EQ(read_result.shape()[1], 2); + EXPECT_EQ(read_result.shape()[2], 4); +} + +TEST(ZarrDriverTest, OpenAsVoidSpecRoundtrip) { + // Test that open_as_void is properly preserved in spec round-trips + ::nlohmann::json json_spec{ + {"driver", "zarr"}, + {"kvstore", {{"driver", "memory"}, {"path", "prefix/"}}}, + {"open_as_void", true}, + {"metadata", + { + {"compressor", nullptr}, + {"dtype", ", + void_store.dtype()); +} + +TEST(ZarrDriverTest, OpenAsVoidUrlNotSupported) { + // Test that open_as_void is not supported with URL syntax + ::nlohmann::json json_spec{ + {"driver", "zarr"}, + {"kvstore", {{"driver", "memory"}, {"path", "prefix/"}}}, + {"open_as_void", true}, + {"metadata", + { + {"dtype", "({{0x0102, 0x0304}, + {0x0506, 0x0708}}); + TENSORSTORE_EXPECT_OK(tensorstore::Write(data, store).result()); + + // Open as void and read + ::nlohmann::json void_spec{ + {"driver", "zarr"}, + {"kvstore", {{"driver", "memory"}, {"path", "prefix/"}}}, + {"open_as_void", true}, + }; + + TENSORSTORE_ASSERT_OK_AND_ASSIGN( + auto void_store, + tensorstore::Open(void_spec, context, tensorstore::OpenMode::open, + tensorstore::ReadWriteMode::read_write) + .result()); + + // Read the raw bytes + TENSORSTORE_ASSERT_OK_AND_ASSIGN(auto bytes_read, + tensorstore::Read(void_store).result()); + + // Verify shape: [2, 2, 2] where last dim is 2 bytes per uint16 + EXPECT_EQ(bytes_read.shape()[0], 2); + EXPECT_EQ(bytes_read.shape()[1], 2); + EXPECT_EQ(bytes_read.shape()[2], 2); + + // Verify the raw bytes (little endian) + auto bytes_ptr = static_cast(bytes_read.data()); + // First element: 0x0102 -> bytes 0x02, 0x01 (little endian) + EXPECT_EQ(bytes_ptr[0], 0x02); + EXPECT_EQ(bytes_ptr[1], 0x01); +} + } // namespace diff --git a/tensorstore/driver/zarr/schema.yml b/tensorstore/driver/zarr/schema.yml index 45711648c..a90fb7e3a 100644 --- a/tensorstore/driver/zarr/schema.yml +++ b/tensorstore/driver/zarr/schema.yml @@ -17,6 +17,14 @@ allOf: Must be specified if the `.metadata.dtype` specified in the array metadata has more than one field. default: null + open_as_void: + type: boolean + default: false + title: Raw byte access mode. 
+    description: |
+      When true, opens the array as raw bytes instead of interpreting it
+      as structured data. The resulting array will have an additional
+      dimension representing the byte layout of each element.
   metadata:
     title: Zarr array metadata.
     description: |
diff --git a/tensorstore/driver/zarr/spec.cc b/tensorstore/driver/zarr/spec.cc
index 34a2825f9..4857d045b 100644
--- a/tensorstore/driver/zarr/spec.cc
+++ b/tensorstore/driver/zarr/spec.cc
@@ -151,7 +151,8 @@ absl::Status ValidateMetadata(const ZarrMetadata& metadata,
 
 Result<ZarrMetadataPtr> GetNewMetadata(
     const ZarrPartialMetadata& partial_metadata,
-    const SelectedField& selected_field, const Schema& schema) {
+    const SelectedField& selected_field, const Schema& schema,
+    bool open_as_void) {
   ZarrMetadataPtr metadata = std::make_shared<ZarrMetadata>();
   metadata->zarr_format = partial_metadata.zarr_format.value_or(2);
   metadata->dimension_separator = partial_metadata.dimension_separator.value_or(
@@ -172,7 +173,12 @@ Result<ZarrMetadataPtr> GetNewMetadata(
     // multi-field zarr dtype is desired, it must be specified explicitly.
     TENSORSTORE_ASSIGN_OR_RETURN(
         selected_field_index,
-        GetFieldIndex(*partial_metadata.dtype, selected_field));
+        GetFieldIndex(*partial_metadata.dtype, selected_field, open_as_void));
+    // For void access, use field 0 for metadata creation since we use all
+    // fields as raw bytes
+    if (selected_field_index == kVoidFieldIndex) {
+      selected_field_index = 0;
+    }
     metadata->dtype = *partial_metadata.dtype;
   } else {
     if (!selected_field.empty()) {
@@ -527,7 +533,17 @@ std::string GetFieldNames(const ZarrDType& dtype) {
 }  // namespace
 
 Result<size_t> GetFieldIndex(const ZarrDType& dtype,
-                             const SelectedField& selected_field) {
+                             const SelectedField& selected_field,
+                             bool open_as_void) {
+  // Special case: open_as_void requests raw byte access (works for any dtype)
+  if (open_as_void) {
+    if (dtype.fields.empty()) {
+      return absl::FailedPreconditionError(
+          "Requested void access but dtype has no fields");
+    }
+    return kVoidFieldIndex;
+  }
+
   if (selected_field.empty()) {
     if (dtype.fields.size() != 1) {
       return absl::FailedPreconditionError(tensorstore::StrCat(
diff --git a/tensorstore/driver/zarr/spec.h b/tensorstore/driver/zarr/spec.h
index 0ef3ab9d3..597fc32f0 100644
--- a/tensorstore/driver/zarr/spec.h
+++ b/tensorstore/driver/zarr/spec.h
@@ -70,9 +70,11 @@ using SelectedField = std::string;
 /// \param partial_metadata Constraints in the form of partial zarr metadata.
 /// \param selected_field The field to which `schema` applies.
 /// \param schema Schema constraints for the `selected_field`.
+/// \param open_as_void If true, opens the array as raw bytes.
 Result<ZarrMetadataPtr> GetNewMetadata(
     const ZarrPartialMetadata& partial_metadata,
-    const SelectedField& selected_field, const Schema& schema);
+    const SelectedField& selected_field, const Schema& schema,
+    bool open_as_void = false);
 
 struct SpecRankAndFieldInfo {
   /// Full rank of the TensorStore, if known. Equal to the chunked rank plus
@@ -134,11 +136,16 @@ Result<SelectedField> ParseSelectedField(const ::nlohmann::json& value);
 /// \param dtype The parsed zarr "dtype" specification.
 /// \param selected_field The label of the field, or an empty string to
 ///     indicate that the zarr array must have only a single field.
-/// \returns The field index.
+/// \param open_as_void If true, returns kVoidFieldIndex for raw byte access.
+/// \returns The field index, or kVoidFieldIndex if open_as_void is true.
 /// \error `absl::StatusCode::kFailedPrecondition` if `selected_field` is not
 ///     valid.
 Result<size_t> GetFieldIndex(const ZarrDType& dtype,
-                             const SelectedField& selected_field);
+                             const SelectedField& selected_field,
+                             bool open_as_void = false);
+
+/// Special field index indicating void (raw byte) access.
+constexpr size_t kVoidFieldIndex = size_t(-1);
 
 /// Encodes a field index as a `SelectedField` JSON specification.
 ///

From 2aedabf4cf053a8c7d2246cb5f336328900fea56 Mon Sep 17 00:00:00 2001
From: BrianMichell
Date: Mon, 5 Jan 2026 17:29:15 +0000
Subject: [PATCH 02/14] Remove default `open_as_void` from definitions

---
 tensorstore/driver/zarr/driver.cc     | 8 +++-----
 tensorstore/driver/zarr/driver_impl.h | 4 ++--
 tensorstore/driver/zarr/spec.h        | 4 ++--
 3 files changed, 7 insertions(+), 9 deletions(-)

diff --git a/tensorstore/driver/zarr/driver.cc b/tensorstore/driver/zarr/driver.cc
index 8a0943ae5..0eecd5b96 100644
--- a/tensorstore/driver/zarr/driver.cc
+++ b/tensorstore/driver/zarr/driver.cc
@@ -271,15 +271,13 @@ Result<SharedArray<const void>> ZarrDriverSpec::GetFillValue(
 
 DataCache::DataCache(Initializer&& initializer, std::string key_prefix,
                      DimensionSeparator dimension_separator,
-                     std::string metadata_key, bool open_as_void)
+                     std::string metadata_key)
     : Base(std::move(initializer),
            GetChunkGridSpecification(
-               *static_cast<ZarrMetadata*>(initializer.metadata.get()),
-               open_as_void)),
+               *static_cast<ZarrMetadata*>(initializer.metadata.get()))),
       key_prefix_(std::move(key_prefix)),
       dimension_separator_(dimension_separator),
-      metadata_key_(std::move(metadata_key)),
-      open_as_void_(open_as_void) {}
+      metadata_key_(std::move(metadata_key)) {}
 
 absl::Status DataCache::ValidateMetadataCompatibility(
     const void* existing_metadata_ptr, const void* new_metadata_ptr) {
diff --git a/tensorstore/driver/zarr/driver_impl.h b/tensorstore/driver/zarr/driver_impl.h
index c2933dd90..0871084b9 100644
--- a/tensorstore/driver/zarr/driver_impl.h
+++ b/tensorstore/driver/zarr/driver_impl.h
@@ -99,7 +99,7 @@ class DataCache : public internal_kvs_backed_chunk_driver::DataCache {
  public:
   explicit DataCache(Initializer&& initializer, std::string key_prefix,
                      DimensionSeparator dimension_separator,
-                     std::string metadata_key, bool open_as_void = false);
+                     std::string metadata_key);
 
   const ZarrMetadata& metadata() {
     return *static_cast<const ZarrMetadata*>(initial_metadata().get());
   }
@@ -118,7 +118,7 @@ class DataCache : public internal_kvs_backed_chunk_driver::DataCache {
   /// Returns the ChunkCache grid to use for the given metadata.
   static internal::ChunkGridSpecification GetChunkGridSpecification(
-      const ZarrMetadata& metadata, bool open_as_void = false);
+      const ZarrMetadata& metadata);
 
   Result<absl::InlinedVector<SharedArray<const void>, 1>> DecodeChunk(
       span<const Index> chunk_indices, absl::Cord data) override;
diff --git a/tensorstore/driver/zarr/spec.h b/tensorstore/driver/zarr/spec.h
index 597fc32f0..bb4659023 100644
--- a/tensorstore/driver/zarr/spec.h
+++ b/tensorstore/driver/zarr/spec.h
@@ -74,7 +74,7 @@ using SelectedField = std::string;
 Result<ZarrMetadataPtr> GetNewMetadata(
     const ZarrPartialMetadata& partial_metadata,
     const SelectedField& selected_field, const Schema& schema,
-    bool open_as_void = false);
+    bool open_as_void);
 
 struct SpecRankAndFieldInfo {
   /// Full rank of the TensorStore, if known. Equal to the chunked rank plus
@@ -142,7 +142,7 @@ Result<SelectedField> ParseSelectedField(const ::nlohmann::json& value);
 ///     valid.
 Result<size_t> GetFieldIndex(const ZarrDType& dtype,
                              const SelectedField& selected_field,
-                             bool open_as_void = false);
+                             bool open_as_void);
 
 /// Special field index indicating void (raw byte) access.
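/// The sentinel is size_t(-1), which can never collide with a real field
/// index; callers must check for it before indexing dtype.fields.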
constexpr size_t kVoidFieldIndex = size_t(-1); From 46d99028605171205d256b829245a793042b22ae Mon Sep 17 00:00:00 2001 From: BrianMichell Date: Mon, 5 Jan 2026 17:29:51 +0000 Subject: [PATCH 03/14] Use derived `DataCache` for `open_as_void` --- tensorstore/driver/zarr/driver.cc | 328 +++++++++++++++----------- tensorstore/driver/zarr/driver_impl.h | 31 ++- tensorstore/driver/zarr/spec.cc | 3 +- 3 files changed, 218 insertions(+), 144 deletions(-) diff --git a/tensorstore/driver/zarr/driver.cc b/tensorstore/driver/zarr/driver.cc index 0eecd5b96..892caf7d8 100644 --- a/tensorstore/driver/zarr/driver.cc +++ b/tensorstore/driver/zarr/driver.cc @@ -301,40 +301,12 @@ void DataCache::GetChunkGridBounds(const void* metadata_ptr, DimensionSet& implicit_lower_bounds, DimensionSet& implicit_upper_bounds) { const auto& metadata = *static_cast(metadata_ptr); - // Use >= assertion like zarr3 to allow for extra dimensions - assert(bounds.rank() >= static_cast(metadata.shape.size())); - std::fill(bounds.origin().begin(), - bounds.origin().begin() + metadata.shape.size(), Index(0)); + assert(bounds.rank() == static_cast(metadata.shape.size())); + std::fill(bounds.origin().begin(), bounds.origin().end(), Index(0)); std::copy(metadata.shape.begin(), metadata.shape.end(), bounds.shape().begin()); implicit_lower_bounds = false; - implicit_upper_bounds = false; - for (DimensionIndex i = 0; - i < static_cast(metadata.shape.size()); ++i) { - implicit_upper_bounds[i] = true; - } - // Handle extra dimensions for void access or field shapes - if (bounds.rank() > static_cast(metadata.shape.size())) { - if (open_as_void_) { - // For void access, the extra dimension is the bytes_per_outer_element - if (static_cast(metadata.shape.size() + 1) == - bounds.rank()) { - bounds.shape()[metadata.rank] = metadata.dtype.bytes_per_outer_element; - bounds.origin()[metadata.rank] = 0; - } - } else if (metadata.dtype.fields.size() == 1) { - // Handle single field with field_shape (like zarr3) - const auto& field = metadata.dtype.fields[0]; - if (static_cast(metadata.shape.size() + - field.field_shape.size()) == - bounds.rank()) { - for (size_t i = 0; i < field.field_shape.size(); ++i) { - bounds.shape()[metadata.shape.size() + i] = field.field_shape[i]; - bounds.origin()[metadata.shape.size() + i] = 0; - } - } - } - } + implicit_upper_bounds = true; } Result> DataCache::GetResizedMetadata( @@ -355,61 +327,13 @@ Result> DataCache::GetResizedMetadata( } internal::ChunkGridSpecification DataCache::GetChunkGridSpecification( - const ZarrMetadata& metadata, bool open_as_void) { + const ZarrMetadata& metadata) { internal::ChunkGridSpecification::ComponentList components; + components.reserve(metadata.dtype.fields.size()); std::vector chunked_to_cell_dimensions( metadata.chunks.size()); std::iota(chunked_to_cell_dimensions.begin(), chunked_to_cell_dimensions.end(), static_cast(0)); - - // Special case: void access - create single component for raw bytes - if (open_as_void) { - const Index bytes_per_element = metadata.dtype.bytes_per_outer_element; - - // Create a zero-filled byte array as the fill value - auto base_fill_value = AllocateArray( - span({bytes_per_element}), c_order, value_init, - dtype_v); - - // The full chunk shape includes the extra bytes dimension - std::vector chunk_shape_with_bytes = metadata.chunks; - chunk_shape_with_bytes.push_back(bytes_per_element); - - const DimensionIndex cell_rank = metadata.rank + 1; - - // Broadcast fill value to target shape [unbounded, ..., bytes_per_element] - // like zarr3 does - 
std::vector target_shape(metadata.rank, kInfIndex); - target_shape.push_back(bytes_per_element); - auto chunk_fill_value = - BroadcastArray(base_fill_value, BoxView<>(target_shape)).value(); - - // Create valid data bounds - unbounded for chunked dimensions, - // explicit for bytes dimension - Box<> valid_data_bounds(cell_rank); - for (DimensionIndex i = 0; i < metadata.rank; ++i) { - valid_data_bounds[i] = IndexInterval::Infinite(); - } - valid_data_bounds[metadata.rank] = - IndexInterval::UncheckedSized(0, bytes_per_element); - - // Create permutation: copy existing order and add the bytes dimension - DimensionIndex layout_order_buffer[kMaxRank]; - GetChunkInnerOrder(metadata.rank, metadata.order, - span(layout_order_buffer, metadata.rank)); - layout_order_buffer[metadata.rank] = metadata.rank; // Add bytes dimension - - components.emplace_back( - internal::AsyncWriteArray::Spec{ - std::move(chunk_fill_value), std::move(valid_data_bounds), - ContiguousLayoutPermutation<>(span(layout_order_buffer, cell_rank))}, - std::move(chunk_shape_with_bytes), chunked_to_cell_dimensions); - - return internal::ChunkGridSpecification{std::move(components)}; - } - - // Normal field-based access - components.reserve(metadata.dtype.fields.size()); for (size_t field_i = 0; field_i < metadata.dtype.fields.size(); ++field_i) { const auto& field = metadata.dtype.fields[field_i]; const auto& field_layout = metadata.chunk_layout.fields[field_i]; @@ -444,70 +368,12 @@ internal::ChunkGridSpecification DataCache::GetChunkGridSpecification( Result, 1>> DataCache::DecodeChunk( span chunk_indices, absl::Cord data) { - if (open_as_void_) { - // For void access, return raw bytes as a single component - const auto& md = metadata(); - - // Decompress the data first (if compressed) - absl::Cord decompressed = std::move(data); - if (md.compressor) { - riegeli::CordReader base_reader(std::move(decompressed)); - auto compressed_reader = md.compressor->GetReader( - base_reader, md.dtype.bytes_per_outer_element); - absl::Cord uncompressed; - TENSORSTORE_RETURN_IF_ERROR( - riegeli::ReadAll(std::move(compressed_reader), uncompressed)); - if (!base_reader.VerifyEndAndClose()) return base_reader.status(); - decompressed = std::move(uncompressed); - } - - // Build the shape: chunk_shape + bytes_per_element - std::vector shape = md.chunks; - shape.push_back(md.dtype.bytes_per_outer_element); - - // Create a byte array from the decompressed data - auto flat_data = decompressed.Flatten(); - auto byte_array = AllocateArray(shape, c_order, default_init, - dtype_v); - std::memcpy(byte_array.data(), flat_data.data(), - std::min(static_cast(byte_array.num_elements()), - flat_data.size())); - - absl::InlinedVector, 1> result; - result.push_back(std::move(byte_array)); - return result; - } return internal_zarr::DecodeChunk(metadata(), std::move(data)); } Result DataCache::EncodeChunk( span chunk_indices, span> component_arrays) { - if (open_as_void_) { - // For void access, encode raw bytes directly - const auto& md = metadata(); - if (component_arrays.size() != 1) { - return absl::InvalidArgumentError( - "Expected exactly one component array for void access"); - } - const auto& byte_array = component_arrays[0]; - absl::Cord uncompressed( - std::string_view(static_cast(byte_array.data()), - byte_array.num_elements())); - - // Compress if needed - if (md.compressor) { - absl::Cord encoded; - riegeli::CordWriter base_writer(&encoded); - auto writer = md.compressor->GetWriter( - base_writer, md.dtype.bytes_per_outer_element); - 
TENSORSTORE_RETURN_IF_ERROR( - riegeli::Write(std::move(uncompressed), std::move(writer))); - if (!base_writer.Close()) return base_writer.status(); - return encoded; - } - return uncompressed; - } return internal_zarr::EncodeChunk(metadata(), component_arrays); } @@ -523,7 +389,7 @@ absl::Status DataCache::GetBoundSpecData( const auto& metadata = *static_cast(metadata_ptr); spec.selected_field = EncodeSelectedField(component_index, metadata.dtype); spec.metadata_key = metadata_key_; - spec.open_as_void = open_as_void_; + spec.open_as_void = false; auto& pm = spec.partial_metadata; pm.rank = metadata.rank; pm.zarr_format = metadata.zarr_format; @@ -550,6 +416,178 @@ Result DataCache::GetChunkLayoutFromMetadata( } std::string DataCache::GetBaseKvstorePath() { return key_prefix_; } + +// VoidDataCache implementation +VoidDataCache::VoidDataCache(Initializer&& initializer, std::string key_prefix, + DimensionSeparator dimension_separator, + std::string metadata_key) + : DataCache(std::move(initializer), std::move(key_prefix), + dimension_separator, std::move(metadata_key)) { + // Replace the grid with the void-specific grid specification + grid_ = GetVoidChunkGridSpecification(metadata()); +} + +void VoidDataCache::GetChunkGridBounds(const void* metadata_ptr, + MutableBoxView<> bounds, + DimensionSet& implicit_lower_bounds, + DimensionSet& implicit_upper_bounds) { + const auto& metadata = *static_cast(metadata_ptr); + // Use >= assertion to allow for extra dimensions (the bytes dimension) + assert(bounds.rank() >= static_cast(metadata.shape.size())); + std::fill(bounds.origin().begin(), + bounds.origin().begin() + metadata.shape.size(), Index(0)); + std::copy(metadata.shape.begin(), metadata.shape.end(), + bounds.shape().begin()); + implicit_lower_bounds = false; + implicit_upper_bounds = false; + for (DimensionIndex i = 0; + i < static_cast(metadata.shape.size()); ++i) { + implicit_upper_bounds[i] = true; + } + // For void access, the extra dimension is the bytes_per_outer_element + if (static_cast(metadata.shape.size() + 1) == bounds.rank()) { + bounds.shape()[metadata.rank] = metadata.dtype.bytes_per_outer_element; + bounds.origin()[metadata.rank] = 0; + } +} + +internal::ChunkGridSpecification VoidDataCache::GetVoidChunkGridSpecification( + const ZarrMetadata& metadata) { + internal::ChunkGridSpecification::ComponentList components; + std::vector chunked_to_cell_dimensions( + metadata.chunks.size()); + std::iota(chunked_to_cell_dimensions.begin(), + chunked_to_cell_dimensions.end(), static_cast(0)); + + const Index bytes_per_element = metadata.dtype.bytes_per_outer_element; + + // Create a zero-filled byte array as the fill value + auto base_fill_value = AllocateArray( + span({bytes_per_element}), c_order, value_init, + dtype_v); + + // The full chunk shape includes the extra bytes dimension + std::vector chunk_shape_with_bytes = metadata.chunks; + chunk_shape_with_bytes.push_back(bytes_per_element); + + const DimensionIndex cell_rank = metadata.rank + 1; + + // Broadcast fill value to target shape [unbounded, ..., bytes_per_element] + // like zarr3 does + std::vector target_shape(metadata.rank, kInfIndex); + target_shape.push_back(bytes_per_element); + auto chunk_fill_value = + BroadcastArray(base_fill_value, BoxView<>(target_shape)).value(); + + // Create valid data bounds - unbounded for chunked dimensions, + // explicit for bytes dimension + Box<> valid_data_bounds(cell_rank); + for (DimensionIndex i = 0; i < metadata.rank; ++i) { + valid_data_bounds[i] = IndexInterval::Infinite(); + } 
+  valid_data_bounds[metadata.rank] =
+      IndexInterval::UncheckedSized(0, bytes_per_element);
+
+  // Create permutation: copy existing order and add the bytes dimension
+  DimensionIndex layout_order_buffer[kMaxRank];
+  GetChunkInnerOrder(metadata.rank, metadata.order,
+                     span(layout_order_buffer, metadata.rank));
+  layout_order_buffer[metadata.rank] = metadata.rank;  // Add bytes dimension
+
+  components.emplace_back(
+      internal::AsyncWriteArray::Spec{
+          std::move(chunk_fill_value), std::move(valid_data_bounds),
+          ContiguousLayoutPermutation<>(span(layout_order_buffer, cell_rank))},
+      std::move(chunk_shape_with_bytes), chunked_to_cell_dimensions);
+
+  return internal::ChunkGridSpecification{std::move(components)};
+}
+
+Result<absl::InlinedVector<SharedArray<const void>, 1>>
+VoidDataCache::DecodeChunk(span<const Index> chunk_indices, absl::Cord data) {
+  // For void access, return raw bytes as a single component
+  const auto& md = metadata();
+
+  // Decompress the data first (if compressed)
+  absl::Cord decompressed = std::move(data);
+  if (md.compressor) {
+    riegeli::CordReader base_reader(std::move(decompressed));
+    auto compressed_reader = md.compressor->GetReader(
+        base_reader, md.dtype.bytes_per_outer_element);
+    absl::Cord uncompressed;
+    TENSORSTORE_RETURN_IF_ERROR(
+        riegeli::ReadAll(std::move(compressed_reader), uncompressed));
+    if (!base_reader.VerifyEndAndClose()) return base_reader.status();
+    decompressed = std::move(uncompressed);
+  }
+
+  // Build the shape: chunk_shape + bytes_per_element
+  std::vector<Index> shape = md.chunks;
+  shape.push_back(md.dtype.bytes_per_outer_element);
+
+  // Create a byte array from the decompressed data
+  auto flat_data = decompressed.Flatten();
+  auto byte_array = AllocateArray(shape, c_order, default_init,
+                                  dtype_v<::tensorstore::dtypes::byte_t>);
+  std::memcpy(byte_array.data(), flat_data.data(),
+              std::min(static_cast<size_t>(byte_array.num_elements()),
+                       flat_data.size()));
+
+  absl::InlinedVector<SharedArray<const void>, 1> result;
+  result.push_back(std::move(byte_array));
+  return result;
+}
+
+Result<absl::Cord> VoidDataCache::EncodeChunk(
+    span<const Index> chunk_indices,
+    span<const SharedArray<const void>> component_arrays) {
+  // For void access, encode raw bytes directly
+  const auto& md = metadata();
+  if (component_arrays.size() != 1) {
+    return absl::InvalidArgumentError(
+        "Expected exactly one component array for void access");
+  }
+  const auto& byte_array = component_arrays[0];
+  absl::Cord uncompressed(
+      std::string_view(static_cast<const char*>(byte_array.data()),
+                       byte_array.num_elements()));
+
+  // Compress if needed
+  if (md.compressor) {
+    absl::Cord encoded;
+    riegeli::CordWriter base_writer(&encoded);
+    auto writer = md.compressor->GetWriter(base_writer,
+                                           md.dtype.bytes_per_outer_element);
+    TENSORSTORE_RETURN_IF_ERROR(
+        riegeli::Write(std::move(uncompressed), std::move(writer)));
+    if (!base_writer.Close()) return base_writer.status();
+    return encoded;
+  }
+  return uncompressed;
+}
+
+absl::Status VoidDataCache::GetBoundSpecData(
+    internal_kvs_backed_chunk_driver::KvsDriverSpec& spec_base,
+    const void* metadata_ptr, size_t component_index) {
+  auto& spec = static_cast<ZarrDriverSpec&>(spec_base);
+  const auto& metadata = *static_cast<const ZarrMetadata*>(metadata_ptr);
+  spec.selected_field = EncodeSelectedField(component_index, metadata.dtype);
+  spec.metadata_key = metadata_key_;
+  spec.open_as_void = true;
+  auto& pm = spec.partial_metadata;
+  pm.rank = metadata.rank;
+  pm.zarr_format = metadata.zarr_format;
+  pm.shape = metadata.shape;
+  pm.chunks = metadata.chunks;
+  pm.compressor = metadata.compressor;
+  pm.filters = metadata.filters;
+  pm.order = metadata.order;
+  pm.dtype = metadata.dtype;
+  pm.fill_value = metadata.fill_value;
+ pm.dimension_separator = dimension_separator_; + return absl::OkStatus(); +} + Result ZarrDriver::GetCodec() { return internal_zarr::GetCodecSpecFromMetadata(metadata()); } @@ -623,7 +661,7 @@ Future ZarrDriver::GetStorageStatistics( /*chunk_shape=*/grid.chunk_shape, /*shape=*/metadata->shape, /*dimension_separator=*/ - GetDimensionSeparatorChar(cache->dimension_separator_), + GetDimensionSeparatorChar(cache->dimension_separator()), staleness_bound, request.options)); }), std::move(promise), std::move(metadata_future)); @@ -678,10 +716,16 @@ class ZarrDriver::OpenState : public ZarrDriver::OpenStateBase { DataCache::Initializer&& initializer) override { const auto& metadata = *static_cast(initializer.metadata.get()); + if (spec().open_as_void) { + return std::make_unique( + std::move(initializer), spec().store.path, + GetDimensionSeparator(spec().partial_metadata, metadata), + spec().metadata_key); + } return std::make_unique( std::move(initializer), spec().store.path, GetDimensionSeparator(spec().partial_metadata, metadata), - spec().metadata_key, spec().open_as_void); + spec().metadata_key); } Result GetComponentIndex(const void* metadata_ptr, diff --git a/tensorstore/driver/zarr/driver_impl.h b/tensorstore/driver/zarr/driver_impl.h index 0871084b9..fe5e2c6f1 100644 --- a/tensorstore/driver/zarr/driver_impl.h +++ b/tensorstore/driver/zarr/driver_impl.h @@ -138,10 +138,39 @@ class DataCache : public internal_kvs_backed_chunk_driver::DataCache { std::string GetBaseKvstorePath() override; + DimensionSeparator dimension_separator() const { return dimension_separator_; } + + protected: std::string key_prefix_; DimensionSeparator dimension_separator_; std::string metadata_key_; - bool open_as_void_; +}; + +/// Derived DataCache for open_as_void mode that provides raw byte access. +class VoidDataCache : public DataCache { + public: + explicit VoidDataCache(Initializer&& initializer, std::string key_prefix, + DimensionSeparator dimension_separator, + std::string metadata_key); + + void GetChunkGridBounds(const void* metadata_ptr, MutableBoxView<> bounds, + DimensionSet& implicit_lower_bounds, + DimensionSet& implicit_upper_bounds) override; + + /// Returns the ChunkCache grid for void (raw byte) access. 
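+  /// For example (illustrative): chunks of shape [100, 100] over an "<i4"
+  /// dtype yield a single component with chunk shape [100, 100, 4] and a
+  /// zero-byte fill value broadcast over the chunked dimensions.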
+ static internal::ChunkGridSpecification GetVoidChunkGridSpecification( + const ZarrMetadata& metadata); + + Result, 1>> DecodeChunk( + span chunk_indices, absl::Cord data) override; + + Result EncodeChunk( + span chunk_indices, + span> component_arrays) override; + + absl::Status GetBoundSpecData( + internal_kvs_backed_chunk_driver::KvsDriverSpec& spec_base, + const void* metadata_ptr, size_t component_index) override; }; class ZarrDriver; diff --git a/tensorstore/driver/zarr/spec.cc b/tensorstore/driver/zarr/spec.cc index 4857d045b..38a74ce30 100644 --- a/tensorstore/driver/zarr/spec.cc +++ b/tensorstore/driver/zarr/spec.cc @@ -346,7 +346,8 @@ Result GetSpecRankAndFieldInfo( if (metadata.dtype) { TENSORSTORE_ASSIGN_OR_RETURN( - size_t field_index, GetFieldIndex(*metadata.dtype, selected_field)); + size_t field_index, + GetFieldIndex(*metadata.dtype, selected_field, /*open_as_void=*/false)); info.field = &metadata.dtype->fields[field_index]; } From ccc4bd73caada118f6c72bdd0fb48e9ea8357694 Mon Sep 17 00:00:00 2001 From: BrianMichell Date: Tue, 6 Jan 2026 14:36:44 +0000 Subject: [PATCH 04/14] Fix compile issues for missing argument --- tensorstore/driver/zarr/spec_test.cc | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/tensorstore/driver/zarr/spec_test.cc b/tensorstore/driver/zarr/spec_test.cc index 6cd358dd5..471cf4a1f 100644 --- a/tensorstore/driver/zarr/spec_test.cc +++ b/tensorstore/driver/zarr/spec_test.cc @@ -187,12 +187,12 @@ TEST(ParseSelectedFieldTest, InvalidType) { } TEST(GetFieldIndexTest, Null) { - EXPECT_EQ(0u, GetFieldIndex(ParseDType(" GetNewMetadataFromOptions( ZarrPartialMetadata::FromJson(partial_metadata_json)); TENSORSTORE_ASSIGN_OR_RETURN( auto new_metadata, - GetNewMetadata(partial_metadata, selected_field, schema)); + GetNewMetadata(partial_metadata, selected_field, schema, false)); return new_metadata->ToJson(); } From 5d4a68f3b26d2d601898a0adde170c9a98d0d45f Mon Sep 17 00:00:00 2001 From: BrianMichell Date: Tue, 6 Jan 2026 14:42:08 +0000 Subject: [PATCH 05/14] Correct tests, add argument comment for open as void value --- tensorstore/driver/zarr/spec_test.cc | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/tensorstore/driver/zarr/spec_test.cc b/tensorstore/driver/zarr/spec_test.cc index 471cf4a1f..81f105735 100644 --- a/tensorstore/driver/zarr/spec_test.cc +++ b/tensorstore/driver/zarr/spec_test.cc @@ -187,12 +187,12 @@ TEST(ParseSelectedFieldTest, InvalidType) { } TEST(GetFieldIndexTest, Null) { - EXPECT_EQ(0u, GetFieldIndex(ParseDType(" Date: Tue, 6 Jan 2026 15:18:08 +0000 Subject: [PATCH 06/14] Add test coverage for `GetSpecInfo` --- tensorstore/driver/zarr/driver_test.cc | 154 +++++++++++++++++++++++++ 1 file changed, 154 insertions(+) diff --git a/tensorstore/driver/zarr/driver_test.cc b/tensorstore/driver/zarr/driver_test.cc index a5014987d..b3bb42f4e 100644 --- a/tensorstore/driver/zarr/driver_test.cc +++ b/tensorstore/driver/zarr/driver_test.cc @@ -3821,4 +3821,158 @@ TEST(ZarrDriverTest, OpenAsVoidReadWrite) { EXPECT_EQ(bytes_ptr[1], 0x01); } +// Tests for GetSpecInfo() with open_as_void + +TEST(ZarrDriverTest, GetSpecInfoOpenAsVoidWithKnownRank) { + // Test that GetSpecInfo correctly computes rank when open_as_void=true + // and dtype is specified with known chunked_rank. 
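+  // (Illustration: a rank-2 "<u2" array opened as void presents as a rank-3
+  // byte array whose trailing dimension has extent 2.)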
+ // Expected: full_rank = chunked_rank + 1 (for bytes dimension) + ::nlohmann::json json_spec{ + {"driver", "zarr"}, + {"kvstore", {{"driver", "memory"}, {"path", "prefix/"}}}, + {"open_as_void", true}, + {"metadata", + { + {"dtype", " Date: Tue, 6 Jan 2026 15:28:28 +0000 Subject: [PATCH 07/14] Resolve feedback `https://github.com/google/tensorstore/pull/272#discussion_r2663351949` and extend test coverage --- tensorstore/driver/zarr/driver.cc | 7 +- tensorstore/driver/zarr/driver_test.cc | 162 +++++++++++++++++++++++++ 2 files changed, 165 insertions(+), 4 deletions(-) diff --git a/tensorstore/driver/zarr/driver.cc b/tensorstore/driver/zarr/driver.cc index 892caf7d8..7e1c1a18b 100644 --- a/tensorstore/driver/zarr/driver.cc +++ b/tensorstore/driver/zarr/driver.cc @@ -59,6 +59,7 @@ #include "tensorstore/internal/chunk_grid_specification.h" #include "tensorstore/internal/grid_storage_statistics.h" #include "tensorstore/internal/intrusive_ptr.h" +#include "tensorstore/internal/riegeli/array_endian_codec.h" #include "tensorstore/internal/json_binding/bindable.h" #include "tensorstore/internal/json_binding/json_binding.h" #include "tensorstore/internal/uri_utils.h" @@ -547,10 +548,8 @@ Result VoidDataCache::EncodeChunk( return absl::InvalidArgumentError( "Expected exactly one component array for void access"); } - const auto& byte_array = component_arrays[0]; - absl::Cord uncompressed( - std::string_view(static_cast(byte_array.data()), - byte_array.num_elements())); + absl::Cord uncompressed = internal::MakeCordFromSharedPtr( + component_arrays[0].pointer(), component_arrays[0].num_elements()); // Compress if needed if (md.compressor) { diff --git a/tensorstore/driver/zarr/driver_test.cc b/tensorstore/driver/zarr/driver_test.cc index b3bb42f4e..0d1a70193 100644 --- a/tensorstore/driver/zarr/driver_test.cc +++ b/tensorstore/driver/zarr/driver_test.cc @@ -3821,6 +3821,168 @@ TEST(ZarrDriverTest, OpenAsVoidReadWrite) { EXPECT_EQ(bytes_ptr[1], 0x01); } +TEST(ZarrDriverTest, OpenAsVoidWriteRoundtrip) { + // Test that writing through open_as_void correctly encodes data + // and can be read back both through void access and normal typed access. 
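+  // The void view exposes the stored little-endian layout directly, so e.g.
+  // the "<u2" value 0x1234 must round-trip as the byte pair {0x34, 0x12}.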
+ auto context = Context::Default(); + + // Create an array + ::nlohmann::json create_spec{ + {"driver", "zarr"}, + {"kvstore", {{"driver", "memory"}, {"path", "prefix/"}}}, + {"metadata", + { + {"compressor", nullptr}, + {"dtype", "({{0, 0}, {0, 0}}); + TENSORSTORE_EXPECT_OK(tensorstore::Write(zeros, store).result()); + + // Open as void for writing + ::nlohmann::json void_spec{ + {"driver", "zarr"}, + {"kvstore", {{"driver", "memory"}, {"path", "prefix/"}}}, + {"open_as_void", true}, + }; + + TENSORSTORE_ASSERT_OK_AND_ASSIGN( + auto void_store, + tensorstore::Open(void_spec, context, tensorstore::OpenMode::open, + tensorstore::ReadWriteMode::read_write) + .result()); + + // Create raw bytes representing uint16 values in little endian: + // [0,0]: 0x1234 -> {0x34, 0x12}, [0,1]: 0x5678 -> {0x78, 0x56} + // [1,0]: 0x9ABC -> {0xBC, 0x9A}, [1,1]: 0xDEF0 -> {0xF0, 0xDE} + auto raw_bytes = tensorstore::AllocateArray( + {2, 2, 2}, tensorstore::c_order, tensorstore::value_init); + auto raw_bytes_ptr = static_cast( + const_cast(static_cast(raw_bytes.data()))); + // Element [0,0] = 0x1234 + raw_bytes_ptr[0] = 0x34; + raw_bytes_ptr[1] = 0x12; + // Element [0,1] = 0x5678 + raw_bytes_ptr[2] = 0x78; + raw_bytes_ptr[3] = 0x56; + // Element [1,0] = 0x9ABC + raw_bytes_ptr[4] = 0xBC; + raw_bytes_ptr[5] = 0x9A; + // Element [1,1] = 0xDEF0 + raw_bytes_ptr[6] = 0xF0; + raw_bytes_ptr[7] = 0xDE; + + // Write raw bytes through void access + TENSORSTORE_EXPECT_OK(tensorstore::Write(raw_bytes, void_store).result()); + + // Read back through void access and verify + TENSORSTORE_ASSERT_OK_AND_ASSIGN(auto bytes_read, + tensorstore::Read(void_store).result()); + auto bytes_read_ptr = static_cast(bytes_read.data()); + EXPECT_EQ(bytes_read_ptr[0], 0x34); + EXPECT_EQ(bytes_read_ptr[1], 0x12); + + // Read back through normal typed access and verify the values + TENSORSTORE_ASSERT_OK_AND_ASSIGN( + auto typed_store, + tensorstore::Open(create_spec, context, tensorstore::OpenMode::open, + tensorstore::ReadWriteMode::read) + .result()); + + TENSORSTORE_ASSERT_OK_AND_ASSIGN(auto typed_read, + tensorstore::Read(typed_store).result()); + auto typed_ptr = static_cast(typed_read.data()); + EXPECT_EQ(typed_ptr[0], 0x1234); + EXPECT_EQ(typed_ptr[1], 0x5678); + EXPECT_EQ(typed_ptr[2], 0x9ABC); + EXPECT_EQ(typed_ptr[3], 0xDEF0); +} + +TEST(ZarrDriverTest, OpenAsVoidWriteWithCompression) { + // Test writing through open_as_void with compression enabled. + // Verifies that the EncodeChunk method correctly compresses data. 
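+  // The write path mirrors the read path: the raw bytes are compressed by
+  // VoidDataCache::EncodeChunk, and a subsequent typed open must decode the
+  // identical values.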
+ auto context = Context::Default(); + + // Create an array with blosc compression + ::nlohmann::json create_spec{ + {"driver", "zarr"}, + {"kvstore", {{"driver", "memory"}, {"path", "prefix/"}}}, + {"metadata", + { + {"compressor", {{"id", "blosc"}}}, + {"dtype", "( + {{0, 0, 0, 0}, {0, 0, 0, 0}, {0, 0, 0, 0}, {0, 0, 0, 0}}); + TENSORSTORE_EXPECT_OK(tensorstore::Write(zeros, store).result()); + + // Open as void for writing + ::nlohmann::json void_spec{ + {"driver", "zarr"}, + {"kvstore", {{"driver", "memory"}, {"path", "prefix/"}}}, + {"open_as_void", true}, + }; + + TENSORSTORE_ASSERT_OK_AND_ASSIGN( + auto void_store, + tensorstore::Open(void_spec, context, tensorstore::OpenMode::open, + tensorstore::ReadWriteMode::read_write) + .result()); + + // Create raw bytes representing int32 values in little endian + // Using a simple pattern: 0x01020304 at position [0,0] + auto raw_bytes = tensorstore::AllocateArray( + {4, 4, 4}, tensorstore::c_order, tensorstore::value_init); + + // Set first element to 0x01020304 (little endian: 04 03 02 01) + auto raw_bytes_ptr = static_cast( + const_cast(static_cast(raw_bytes.data()))); + raw_bytes_ptr[0] = 0x04; + raw_bytes_ptr[1] = 0x03; + raw_bytes_ptr[2] = 0x02; + raw_bytes_ptr[3] = 0x01; + + // Write raw bytes through void access (triggers compression) + TENSORSTORE_EXPECT_OK(tensorstore::Write(raw_bytes, void_store).result()); + + // Read back through normal typed access + TENSORSTORE_ASSERT_OK_AND_ASSIGN( + auto typed_store, + tensorstore::Open(create_spec, context, tensorstore::OpenMode::open, + tensorstore::ReadWriteMode::read) + .result()); + + TENSORSTORE_ASSERT_OK_AND_ASSIGN(auto typed_read, + tensorstore::Read(typed_store).result()); + auto typed_ptr = static_cast(typed_read.data()); + + // First element should be 0x01020304 + EXPECT_EQ(typed_ptr[0], 0x01020304); + // Rest should be zeros + EXPECT_EQ(typed_ptr[1], 0); +} + // Tests for GetSpecInfo() with open_as_void TEST(ZarrDriverTest, GetSpecInfoOpenAsVoidWithKnownRank) { From a42b6f511375dc1bd402ad525519f57210546735 Mon Sep 17 00:00:00 2001 From: BrianMichell Date: Tue, 6 Jan 2026 15:41:05 +0000 Subject: [PATCH 08/14] Resolve feedback `https://github.com/google/tensorstore/pull/272#discussion_r2663353574` and extend test coverage --- tensorstore/driver/zarr/driver.cc | 16 ++------- tensorstore/driver/zarr/driver_test.cc | 50 ++++++++++++++++++++++++++ 2 files changed, 52 insertions(+), 14 deletions(-) diff --git a/tensorstore/driver/zarr/driver.cc b/tensorstore/driver/zarr/driver.cc index 7e1c1a18b..50c942c36 100644 --- a/tensorstore/driver/zarr/driver.cc +++ b/tensorstore/driver/zarr/driver.cc @@ -568,22 +568,10 @@ Result VoidDataCache::EncodeChunk( absl::Status VoidDataCache::GetBoundSpecData( internal_kvs_backed_chunk_driver::KvsDriverSpec& spec_base, const void* metadata_ptr, size_t component_index) { + TENSORSTORE_RETURN_IF_ERROR( + DataCache::GetBoundSpecData(spec_base, metadata_ptr, component_index)); auto& spec = static_cast(spec_base); - const auto& metadata = *static_cast(metadata_ptr); - spec.selected_field = EncodeSelectedField(component_index, metadata.dtype); - spec.metadata_key = metadata_key_; spec.open_as_void = true; - auto& pm = spec.partial_metadata; - pm.rank = metadata.rank; - pm.zarr_format = metadata.zarr_format; - pm.shape = metadata.shape; - pm.chunks = metadata.chunks; - pm.compressor = metadata.compressor; - pm.filters = metadata.filters; - pm.order = metadata.order; - pm.dtype = metadata.dtype; - pm.fill_value = metadata.fill_value; - pm.dimension_separator = 
dimension_separator_; return absl::OkStatus(); } diff --git a/tensorstore/driver/zarr/driver_test.cc b/tensorstore/driver/zarr/driver_test.cc index 0d1a70193..b6ebaa99c 100644 --- a/tensorstore/driver/zarr/driver_test.cc +++ b/tensorstore/driver/zarr/driver_test.cc @@ -3698,6 +3698,56 @@ TEST(ZarrDriverTest, OpenAsVoidSpecRoundtrip) { EXPECT_EQ(true, json_result.value("open_as_void", false)); } +TEST(ZarrDriverTest, OpenAsVoidGetBoundSpecData) { + // Test that open_as_void is correctly preserved when getting spec from an + // opened void store. This tests VoidDataCache::GetBoundSpecData. + auto context = Context::Default(); + + // First create a normal array + ::nlohmann::json create_spec{ + {"driver", "zarr"}, + {"kvstore", {{"driver", "memory"}, {"path", "prefix/"}}}, + {"metadata", + { + {"compressor", nullptr}, + {"dtype", " Date: Wed, 7 Jan 2026 14:57:00 +0000 Subject: [PATCH 09/14] Resolve feedback `https://github.com/google/tensorstore/pull/272/changes/BASE..a42b6f511375dc1bd402ad525519f57210546735#r2666111665` Enforce schema validation for one-of `field` and `open_as_void` --- tensorstore/driver/zarr/driver.cc | 5 +++ tensorstore/driver/zarr/driver_test.cc | 45 ++++++++++---------------- tensorstore/driver/zarr/schema.yml | 39 ++++++++++++---------- 3 files changed, 44 insertions(+), 45 deletions(-) diff --git a/tensorstore/driver/zarr/driver.cc b/tensorstore/driver/zarr/driver.cc index 50c942c36..4151e69fe 100644 --- a/tensorstore/driver/zarr/driver.cc +++ b/tensorstore/driver/zarr/driver.cc @@ -195,6 +195,11 @@ TENSORSTORE_DEFINE_JSON_DEFAULT_BINDER( jb::DefaultValue( [](auto* v) { *v = false; }))), jb::Initialize([](auto* obj) { + // Validate that field and open_as_void are mutually exclusive + if (obj->open_as_void && !obj->selected_field.empty()) { + return absl::InvalidArgumentError( + "\"field\" and \"open_as_void\" are mutually exclusive"); + } TENSORSTORE_ASSIGN_OR_RETURN(auto info, obj->GetSpecInfo()); if (info.full_rank != dynamic_rank) { TENSORSTORE_RETURN_IF_ERROR( diff --git a/tensorstore/driver/zarr/driver_test.cc b/tensorstore/driver/zarr/driver_test.cc index b6ebaa99c..dc670e9a5 100644 --- a/tensorstore/driver/zarr/driver_test.cc +++ b/tensorstore/driver/zarr/driver_test.cc @@ -3749,11 +3749,9 @@ TEST(ZarrDriverTest, OpenAsVoidGetBoundSpecData) { } TEST(ZarrDriverTest, OpenAsVoidCannotUseWithField) { - // Test that specifying both open_as_void and field is handled appropriately - auto context = Context::Default(); - - // First create the array - ::nlohmann::json create_spec{ + // Test that specifying both open_as_void and field is rejected as they are + // mutually exclusive options. + ::nlohmann::json spec_with_both{ {"driver", "zarr"}, {"kvstore", {{"driver", "memory"}, {"path", "prefix/"}}}, {"metadata", @@ -3764,33 +3762,24 @@ TEST(ZarrDriverTest, OpenAsVoidCannotUseWithField) { {"chunks", {2, 2}}, }}, {"field", "x"}, - }; - - TENSORSTORE_ASSERT_OK_AND_ASSIGN( - auto store, - tensorstore::Open(create_spec, context, tensorstore::OpenMode::create, - tensorstore::ReadWriteMode::read_write) - .result()); - - // Using open_as_void takes precedence - it opens as raw bytes regardless of - // field selection. The field parameter should be ignored when open_as_void is - // true. 
- ::nlohmann::json void_spec{ - {"driver", "zarr"}, - {"kvstore", {{"driver", "memory"}, {"path", "prefix/"}}}, {"open_as_void", true}, }; - // This should succeed - open_as_void gives raw byte access - TENSORSTORE_ASSERT_OK_AND_ASSIGN( - auto void_store, - tensorstore::Open(void_spec, context, tensorstore::OpenMode::open, - tensorstore::ReadWriteMode::read) - .result()); + // Specifying both field and open_as_void should fail + EXPECT_THAT( + tensorstore::Open(spec_with_both, tensorstore::OpenMode::create, + tensorstore::ReadWriteMode::read_write) + .result(), + StatusIs(absl::StatusCode::kInvalidArgument, + HasSubstr("\"field\" and \"open_as_void\" are mutually " + "exclusive"))); - EXPECT_EQ(3, void_store.rank()); - EXPECT_EQ(tensorstore::dtype_v, - void_store.dtype()); + // Also test that Spec::FromJson rejects this combination + EXPECT_THAT( + tensorstore::Spec::FromJson(spec_with_both), + StatusIs(absl::StatusCode::kInvalidArgument, + HasSubstr("\"field\" and \"open_as_void\" are mutually " + "exclusive"))); } TEST(ZarrDriverTest, OpenAsVoidUrlNotSupported) { diff --git a/tensorstore/driver/zarr/schema.yml b/tensorstore/driver/zarr/schema.yml index a90fb7e3a..af19214bb 100644 --- a/tensorstore/driver/zarr/schema.yml +++ b/tensorstore/driver/zarr/schema.yml @@ -8,23 +8,28 @@ allOf: oneOf: - const: zarr2 - const: zarr - field: - oneOf: - - type: string - - type: "null" - title: Name of field to open. - description: | - Must be specified if the `.metadata.dtype` specified in the array - metadata has more than one field. - default: null - open_as_void: - type: boolean - default: false - title: Raw byte access mode. - description: | - When true, opens the array as raw bytes instead of interpreting it - as structured data. The resulting array will have an additional - dimension representing the byte layout of each element. + oneOf: + - type: object + properties: + field: + oneOf: + - type: string + - type: "null" + title: Name of field to open. + description: | + Must be specified if the `.metadata.dtype` specified in the array + metadata has more than one field. + default: null + - type: object + properties: + open_as_void: + const: true + type: boolean + title: Raw byte access mode. + description: | + When true, opens the array as raw bytes instead of interpreting it + as structured data. The resulting array will have an additional + dimension representing the byte layout of each element. metadata: title: Zarr array metadata. 
description: | From 7fb91d751f24a1dd479fd389378f9e6ad33c8112 Mon Sep 17 00:00:00 2001 From: BrianMichell Date: Wed, 7 Jan 2026 16:15:32 +0000 Subject: [PATCH 10/14] Resolve feedback `https://github.com/google/tensorstore/pull/272#discussion_r2666195572` Use synthesized open_as_void field --- tensorstore/driver/zarr/driver.cc | 17 ++--------------- tensorstore/driver/zarr/dtype.cc | 18 ++++++++++++++++++ tensorstore/driver/zarr/dtype.h | 7 +++++++ tensorstore/driver/zarr/spec.cc | 15 +++++++++++++-- tensorstore/driver/zarr/spec.h | 8 ++++++++ 5 files changed, 48 insertions(+), 17 deletions(-) diff --git a/tensorstore/driver/zarr/driver.cc b/tensorstore/driver/zarr/driver.cc index 4151e69fe..8b8c3a0a4 100644 --- a/tensorstore/driver/zarr/driver.cc +++ b/tensorstore/driver/zarr/driver.cc @@ -142,21 +142,8 @@ absl::Status ZarrDriverSpec::ApplyOptions(SpecOptions&& options) { } Result ZarrDriverSpec::GetSpecInfo() const { - // For open_as_void, we don't use normal field resolution - // Note: When opening an existing array, dtype may not be known yet, - // so we can't determine the exact rank until metadata is loaded. - if (open_as_void && partial_metadata.dtype) { - SpecRankAndFieldInfo info; - info.full_rank = schema.rank(); - info.chunked_rank = partial_metadata.rank; - // For void access, add one dimension for the bytes - info.field_rank = 1; // The bytes dimension - if (info.chunked_rank != dynamic_rank) { - info.full_rank = info.chunked_rank + 1; - } - return info; - } - return GetSpecRankAndFieldInfo(partial_metadata, selected_field, schema); + return GetSpecRankAndFieldInfo(partial_metadata, selected_field, schema, + open_as_void); } TENSORSTORE_DEFINE_JSON_DEFAULT_BINDER( diff --git a/tensorstore/driver/zarr/dtype.cc b/tensorstore/driver/zarr/dtype.cc index c210fb810..144f5a352 100644 --- a/tensorstore/driver/zarr/dtype.cc +++ b/tensorstore/driver/zarr/dtype.cc @@ -336,6 +336,24 @@ char EndianIndicator(tensorstore::endian e) { return e == tensorstore::endian::little ? '<' : '>'; } +const ZarrDType::Field* ZarrDType::GetVoidField() const { + if (!void_field_cache_) { + const Index nbytes = bytes_per_outer_element; + void_field_cache_ = Field{ + {/*encoded_dtype=*/tensorstore::StrCat("|V", nbytes), + /*dtype=*/dtype_v<::tensorstore::dtypes::byte_t>, + /*endian=*/endian::native, + /*flexible_shape=*/{}}, + /*outer_shape=*/{}, + /*name=*/{}, + /*field_shape=*/{nbytes}, + /*num_inner_elements=*/nbytes, + /*byte_offset=*/0, + /*num_bytes=*/nbytes}; + } + return &*void_field_cache_; +} + Result ChooseBaseDType(DataType dtype) { ZarrDType::BaseDType base_dtype; base_dtype.endian = endian::native; diff --git a/tensorstore/driver/zarr/dtype.h b/tensorstore/driver/zarr/dtype.h index be858d671..1ae652f9b 100644 --- a/tensorstore/driver/zarr/dtype.h +++ b/tensorstore/driver/zarr/dtype.h @@ -114,11 +114,18 @@ struct ZarrDType { /// Bytes per "outer" element (derived value). Index bytes_per_outer_element; + /// Returns a synthesized field for raw byte access to the entire dtype. + /// The returned pointer is valid for the lifetime of this ZarrDType. + const Field* GetVoidField() const; + TENSORSTORE_DECLARE_JSON_DEFAULT_BINDER(ZarrDType, internal_json_binding::NoOptions) friend void to_json(::nlohmann::json& out, // NOLINT const ZarrDType& dtype); + + /// Lazily-computed cache for GetVoidField(). + mutable std::optional void_field_cache_; }; /// Parses a zarr metadata "dtype" JSON specification. 
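A note on the shape of GetVoidField() above: the synthesized field lives in a `mutable` std::optional member, so the const accessor can fill it lazily and hand out a pointer that stays valid for the lifetime of the ZarrDType, as the header comment promises. Below is a minimal stand-alone sketch of the same pattern; the Field and DType types here are simplified stand-ins, not the real ZarrDType.

    #include <optional>
    #include <string>

    struct Field {
      std::string encoded_dtype;  // e.g. "|V14" for a 14-byte opaque field
      long long num_bytes;
    };

    struct DType {
      long long bytes_per_outer_element;

      // Returns a field covering all bytes of one element. Storing the
      // result in a member (rather than a local) keeps the returned
      // pointer valid for the lifetime of this object.
      const Field* GetVoidField() const {
        if (!void_field_cache_) {
          void_field_cache_ = Field{
              "|V" + std::to_string(bytes_per_outer_element),
              bytes_per_outer_element};
        }
        return &*void_field_cache_;
      }

     private:
      // mutable: the cache may be filled from a const accessor.
      mutable std::optional<Field> void_field_cache_;
    };

Caching in a member is what lets SpecRankAndFieldInfo keep a plain `const Field*` in the diffs that follow; the usual caveat for a lazily filled mutable cache applies, namely that a racing first call needs external synchronization.
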
diff --git a/tensorstore/driver/zarr/spec.cc b/tensorstore/driver/zarr/spec.cc index 38a74ce30..16ec03f1d 100644 --- a/tensorstore/driver/zarr/spec.cc +++ b/tensorstore/driver/zarr/spec.cc @@ -339,6 +339,13 @@ absl::Status ValidateSpecRankAndFieldInfo(SpecRankAndFieldInfo& info) { Result GetSpecRankAndFieldInfo( const ZarrPartialMetadata& metadata, const SelectedField& selected_field, const Schema& schema) { + return GetSpecRankAndFieldInfo(metadata, selected_field, schema, + /*open_as_void=*/false); +} + +Result GetSpecRankAndFieldInfo( + const ZarrPartialMetadata& metadata, const SelectedField& selected_field, + const Schema& schema, bool open_as_void) { SpecRankAndFieldInfo info; info.full_rank = schema.rank(); @@ -347,8 +354,12 @@ Result GetSpecRankAndFieldInfo( if (metadata.dtype) { TENSORSTORE_ASSIGN_OR_RETURN( size_t field_index, - GetFieldIndex(*metadata.dtype, selected_field, /*open_as_void=*/false)); - info.field = &metadata.dtype->fields[field_index]; + GetFieldIndex(*metadata.dtype, selected_field, open_as_void)); + if (field_index == kVoidFieldIndex) { + info.field = metadata.dtype->GetVoidField(); + } else { + info.field = &metadata.dtype->fields[field_index]; + } } TENSORSTORE_RETURN_IF_ERROR(ValidateSpecRankAndFieldInfo(info)); diff --git a/tensorstore/driver/zarr/spec.h b/tensorstore/driver/zarr/spec.h index bb4659023..b807cb984 100644 --- a/tensorstore/driver/zarr/spec.h +++ b/tensorstore/driver/zarr/spec.h @@ -97,6 +97,14 @@ Result GetSpecRankAndFieldInfo( const ZarrPartialMetadata& metadata, const SelectedField& selected_field, const Schema& schema); +/// Overload that supports open_as_void mode. +/// +/// When `open_as_void` is true and `metadata.dtype` is specified, `info.field` +/// points to the dtype's synthesized void field. +Result GetSpecRankAndFieldInfo( + const ZarrPartialMetadata& metadata, const SelectedField& selected_field, + const Schema& schema, bool open_as_void); + SpecRankAndFieldInfo GetSpecRankAndFieldInfo(const ZarrMetadata& metadata, size_t field_index); From 389d6a9aed665a0448401aeb55eaad0bdd7736d9 Mon Sep 17 00:00:00 2001 From: BrianMichell Date: Wed, 7 Jan 2026 21:00:37 +0000 Subject: [PATCH 11/14] Resolve feedback `https://github.com/google/tensorstore/pull/272/changes/BASE..7fb91d751f24a1dd479fd389378f9e6ad33c8112#r2669500148` Make the schema valid --- tensorstore/driver/zarr/schema.yml | 41 ++++++++++++++---------------- 1 file changed, 19 insertions(+), 22 deletions(-) diff --git a/tensorstore/driver/zarr/schema.yml b/tensorstore/driver/zarr/schema.yml index af19214bb..91f45eb98 100644 --- a/tensorstore/driver/zarr/schema.yml +++ b/tensorstore/driver/zarr/schema.yml @@ -8,28 +8,25 @@ allOf: oneOf: - const: zarr2 - const: zarr - oneOf: - - type: object - properties: - field: - oneOf: - - type: string - - type: "null" - title: Name of field to open. - description: | - Must be specified if the `.metadata.dtype` specified in the array - metadata has more than one field. - default: null - - type: object - properties: - open_as_void: - const: true - type: boolean - title: Raw byte access mode. - description: | - When true, opens the array as raw bytes instead of interpreting it - as structured data. The resulting array will have an additional - dimension representing the byte layout of each element. + field: + oneOf: + - type: string + - type: "null" + title: Name of field to open. + description: | + Must be specified if the `.metadata.dtype` specified in the array + metadata has more than one field. 
Cannot be specified together with + :json:`"open_as_void": true`. + default: null + open_as_void: + type: boolean + title: Raw byte access mode. + description: | + When true, opens the array as raw bytes instead of interpreting it + as structured data. The resulting array will have an additional + dimension representing the byte layout of each element. Cannot be + :json:`true` if ``field`` is also specified. + default: false metadata: title: Zarr array metadata. description: | From 101011b6c095004348f68b22010757e804eaca7d Mon Sep 17 00:00:00 2001 From: BrianMichell Date: Mon, 12 Jan 2026 18:46:05 +0000 Subject: [PATCH 12/14] Resolve `https://github.com/google/tensorstore/pull/272/changes#r2669223557` Compose ZarrMetadata with `open_as_void` field --- tensorstore/driver/zarr/driver.cc | 182 +++++------------------ tensorstore/driver/zarr/driver_impl.h | 29 ++-- tensorstore/driver/zarr/metadata.cc | 22 +++ tensorstore/driver/zarr/metadata.h | 10 ++ tensorstore/driver/zarr/metadata_test.cc | 88 +++++++++++ 5 files changed, 172 insertions(+), 159 deletions(-) diff --git a/tensorstore/driver/zarr/driver.cc b/tensorstore/driver/zarr/driver.cc index 8b8c3a0a4..1bc72d006 100644 --- a/tensorstore/driver/zarr/driver.cc +++ b/tensorstore/driver/zarr/driver.cc @@ -411,150 +411,44 @@ Result DataCache::GetChunkLayoutFromMetadata( std::string DataCache::GetBaseKvstorePath() { return key_prefix_; } // VoidDataCache implementation -VoidDataCache::VoidDataCache(Initializer&& initializer, std::string key_prefix, - DimensionSeparator dimension_separator, - std::string metadata_key) - : DataCache(std::move(initializer), std::move(key_prefix), - dimension_separator, std::move(metadata_key)) { - // Replace the grid with the void-specific grid specification - grid_ = GetVoidChunkGridSpecification(metadata()); -} +// Uses inherited DataCache constructor and encode/decode methods. +// The void metadata (with dtype containing only the void field) is created +// in GetDataCache and passed via the initializer, so standard encode/decode +// paths work correctly. 
-void VoidDataCache::GetChunkGridBounds(const void* metadata_ptr, - MutableBoxView<> bounds, - DimensionSet& implicit_lower_bounds, - DimensionSet& implicit_upper_bounds) { - const auto& metadata = *static_cast(metadata_ptr); - // Use >= assertion to allow for extra dimensions (the bytes dimension) - assert(bounds.rank() >= static_cast(metadata.shape.size())); - std::fill(bounds.origin().begin(), - bounds.origin().begin() + metadata.shape.size(), Index(0)); - std::copy(metadata.shape.begin(), metadata.shape.end(), - bounds.shape().begin()); - implicit_lower_bounds = false; - implicit_upper_bounds = false; - for (DimensionIndex i = 0; - i < static_cast(metadata.shape.size()); ++i) { - implicit_upper_bounds[i] = true; - } - // For void access, the extra dimension is the bytes_per_outer_element - if (static_cast(metadata.shape.size() + 1) == bounds.rank()) { - bounds.shape()[metadata.rank] = metadata.dtype.bytes_per_outer_element; - bounds.origin()[metadata.rank] = 0; - } -} - -internal::ChunkGridSpecification VoidDataCache::GetVoidChunkGridSpecification( - const ZarrMetadata& metadata) { - internal::ChunkGridSpecification::ComponentList components; - std::vector chunked_to_cell_dimensions( - metadata.chunks.size()); - std::iota(chunked_to_cell_dimensions.begin(), - chunked_to_cell_dimensions.end(), static_cast(0)); - - const Index bytes_per_element = metadata.dtype.bytes_per_outer_element; - - // Create a zero-filled byte array as the fill value - auto base_fill_value = AllocateArray( - span({bytes_per_element}), c_order, value_init, - dtype_v); - - // The full chunk shape includes the extra bytes dimension - std::vector chunk_shape_with_bytes = metadata.chunks; - chunk_shape_with_bytes.push_back(bytes_per_element); - - const DimensionIndex cell_rank = metadata.rank + 1; - - // Broadcast fill value to target shape [unbounded, ..., bytes_per_element] - // like zarr3 does - std::vector target_shape(metadata.rank, kInfIndex); - target_shape.push_back(bytes_per_element); - auto chunk_fill_value = - BroadcastArray(base_fill_value, BoxView<>(target_shape)).value(); +absl::Status VoidDataCache::ValidateMetadataCompatibility( + const void* existing_metadata_ptr, const void* new_metadata_ptr) { + assert(existing_metadata_ptr); + assert(new_metadata_ptr); + const auto& existing_metadata = + *static_cast(existing_metadata_ptr); + const auto& new_metadata = + *static_cast(new_metadata_ptr); - // Create valid data bounds - unbounded for chunked dimensions, - // explicit for bytes dimension - Box<> valid_data_bounds(cell_rank); - for (DimensionIndex i = 0; i < metadata.rank; ++i) { - valid_data_bounds[i] = IndexInterval::Infinite(); + // For void access, we only require that bytes_per_outer_element matches, + // since we're treating the data as raw bytes regardless of the actual dtype. + // Shape is allowed to differ (handled by base class for resizing). + // Other fields like compressor, order, chunks must still match. 
+ if (existing_metadata.dtype.bytes_per_outer_element != + new_metadata.dtype.bytes_per_outer_element) { + return absl::FailedPreconditionError(tensorstore::StrCat( + "Void access metadata bytes_per_outer_element mismatch: existing=", + existing_metadata.dtype.bytes_per_outer_element, + ", new=", new_metadata.dtype.bytes_per_outer_element)); } - valid_data_bounds[metadata.rank] = - IndexInterval::UncheckedSized(0, bytes_per_element); - - // Create permutation: copy existing order and add the bytes dimension - DimensionIndex layout_order_buffer[kMaxRank]; - GetChunkInnerOrder(metadata.rank, metadata.order, - span(layout_order_buffer, metadata.rank)); - layout_order_buffer[metadata.rank] = metadata.rank; // Add bytes dimension - - components.emplace_back( - internal::AsyncWriteArray::Spec{ - std::move(chunk_fill_value), std::move(valid_data_bounds), - ContiguousLayoutPermutation<>(span(layout_order_buffer, cell_rank))}, - std::move(chunk_shape_with_bytes), chunked_to_cell_dimensions); - - return internal::ChunkGridSpecification{std::move(components)}; -} -Result, 1>> -VoidDataCache::DecodeChunk(span chunk_indices, absl::Cord data) { - // For void access, return raw bytes as a single component - const auto& md = metadata(); - - // Decompress the data first (if compressed) - absl::Cord decompressed = std::move(data); - if (md.compressor) { - riegeli::CordReader base_reader(std::move(decompressed)); - auto compressed_reader = md.compressor->GetReader( - base_reader, md.dtype.bytes_per_outer_element); - absl::Cord uncompressed; - TENSORSTORE_RETURN_IF_ERROR( - riegeli::ReadAll(std::move(compressed_reader), uncompressed)); - if (!base_reader.VerifyEndAndClose()) return base_reader.status(); - decompressed = std::move(uncompressed); + // Check that other critical fields match (same as base, but ignoring dtype) + if (existing_metadata.chunks != new_metadata.chunks) { + return absl::FailedPreconditionError("Chunk shape mismatch"); } - - // Build the shape: chunk_shape + bytes_per_element - std::vector shape = md.chunks; - shape.push_back(md.dtype.bytes_per_outer_element); - - // Create a byte array from the decompressed data - auto flat_data = decompressed.Flatten(); - auto byte_array = AllocateArray(shape, c_order, default_init, - dtype_v); - std::memcpy(byte_array.data(), flat_data.data(), - std::min(static_cast(byte_array.num_elements()), - flat_data.size())); - - absl::InlinedVector, 1> result; - result.push_back(std::move(byte_array)); - return result; -} - -Result VoidDataCache::EncodeChunk( - span chunk_indices, - span> component_arrays) { - // For void access, encode raw bytes directly - const auto& md = metadata(); - if (component_arrays.size() != 1) { - return absl::InvalidArgumentError( - "Expected exactly one component array for void access"); + if (existing_metadata.order != new_metadata.order) { + return absl::FailedPreconditionError("Order mismatch"); } - absl::Cord uncompressed = internal::MakeCordFromSharedPtr( - component_arrays[0].pointer(), component_arrays[0].num_elements()); - - // Compress if needed - if (md.compressor) { - absl::Cord encoded; - riegeli::CordWriter base_writer(&encoded); - auto writer = md.compressor->GetWriter(base_writer, - md.dtype.bytes_per_outer_element); - TENSORSTORE_RETURN_IF_ERROR( - riegeli::Write(std::move(uncompressed), std::move(writer))); - if (!base_writer.Close()) return base_writer.status(); - return encoded; + if (existing_metadata.compressor != new_metadata.compressor) { + return absl::FailedPreconditionError("Compressor mismatch"); } - 
return uncompressed; + + return absl::OkStatus(); } absl::Status VoidDataCache::GetBoundSpecData( @@ -693,17 +587,21 @@ class ZarrDriver::OpenState : public ZarrDriver::OpenStateBase { std::unique_ptr GetDataCache( DataCache::Initializer&& initializer) override { - const auto& metadata = + const auto& original_metadata = *static_cast(initializer.metadata.get()); + auto dim_sep = GetDimensionSeparator(spec().partial_metadata, original_metadata); if (spec().open_as_void) { + // Create void metadata from the original. This modifies the dtype to + // contain only the void field, allowing standard encode/decode to work. + // CreateVoidMetadata uses the same chunks and bytes_per_outer_element as + // the original validated metadata, so it should never fail. + initializer.metadata = CreateVoidMetadata(original_metadata).value(); return std::make_unique( - std::move(initializer), spec().store.path, - GetDimensionSeparator(spec().partial_metadata, metadata), + std::move(initializer), spec().store.path, dim_sep, spec().metadata_key); } return std::make_unique( - std::move(initializer), spec().store.path, - GetDimensionSeparator(spec().partial_metadata, metadata), + std::move(initializer), spec().store.path, dim_sep, spec().metadata_key); } diff --git a/tensorstore/driver/zarr/driver_impl.h b/tensorstore/driver/zarr/driver_impl.h index fe5e2c6f1..343383554 100644 --- a/tensorstore/driver/zarr/driver_impl.h +++ b/tensorstore/driver/zarr/driver_impl.h @@ -147,26 +147,21 @@ class DataCache : public internal_kvs_backed_chunk_driver::DataCache { }; /// Derived DataCache for open_as_void mode that provides raw byte access. +/// +/// The void metadata (created via CreateVoidMetadata) has dtype.fields +/// containing only the void field, so inherited encode/decode methods +/// work correctly for raw byte access. GetBoundSpecData is overridden +/// to set open_as_void=true in the spec, and ValidateMetadataCompatibility +/// is overridden to allow different dtypes with the same bytes_per_outer_element. class VoidDataCache : public DataCache { public: - explicit VoidDataCache(Initializer&& initializer, std::string key_prefix, - DimensionSeparator dimension_separator, - std::string metadata_key); + using DataCache::DataCache; - void GetChunkGridBounds(const void* metadata_ptr, MutableBoxView<> bounds, - DimensionSet& implicit_lower_bounds, - DimensionSet& implicit_upper_bounds) override; - - /// Returns the ChunkCache grid for void (raw byte) access. - static internal::ChunkGridSpecification GetVoidChunkGridSpecification( - const ZarrMetadata& metadata); - - Result, 1>> DecodeChunk( - span chunk_indices, absl::Cord data) override; - - Result EncodeChunk( - span chunk_indices, - span> component_arrays) override; + /// For void access, metadata is compatible if bytes_per_outer_element matches, + /// regardless of the actual dtype (since we treat everything as raw bytes). 
+ absl::Status ValidateMetadataCompatibility( + const void* existing_metadata_ptr, + const void* new_metadata_ptr) override; absl::Status GetBoundSpecData( internal_kvs_backed_chunk_driver::KvsDriverSpec& spec_base, diff --git a/tensorstore/driver/zarr/metadata.cc b/tensorstore/driver/zarr/metadata.cc index 75bef0676..77ca20ce1 100644 --- a/tensorstore/driver/zarr/metadata.cc +++ b/tensorstore/driver/zarr/metadata.cc @@ -366,6 +366,28 @@ Result ComputeChunkLayout( return layout; } +Result CreateVoidMetadata(const ZarrMetadata& original) { + auto metadata = std::make_shared(original); + + // Replace dtype with void dtype (single void field) + const auto* void_field = original.dtype.GetVoidField(); + metadata->dtype.has_fields = false; + metadata->dtype.fields = {*void_field}; + // bytes_per_outer_element stays the same (inherited from copy) + + // Set fill_value for the single void field. + // Empty/null fill value is handled by GetChunkGridSpecification. + metadata->fill_value.resize(1); + + // Recompute chunk_layout using existing ValidateMetadata. + // ComputeChunkLayout handles the void field correctly because + // void_field.num_bytes == bytes_per_outer_element, producing + // matching encoded/decoded layouts as required by DecodeChunk. + TENSORSTORE_RETURN_IF_ERROR(ValidateMetadata(*metadata)); + + return metadata; +} + constexpr auto MetadataJsonBinder = [](auto maybe_optional) { return [=](auto is_loading, const auto& options, auto* obj, auto* j) { using T = absl::remove_cvref_t; diff --git a/tensorstore/driver/zarr/metadata.h b/tensorstore/driver/zarr/metadata.h index 3a643e407..25afe531b 100644 --- a/tensorstore/driver/zarr/metadata.h +++ b/tensorstore/driver/zarr/metadata.h @@ -199,6 +199,16 @@ Result ComputeChunkLayout( const ZarrDType& dtype, ContiguousLayoutOrder order, tensorstore::span chunk_shape); +/// Creates a modified ZarrMetadata for void (raw byte) access. +/// +/// The returned metadata has dtype.fields containing only the void field +/// (from GetVoidField()), allowing standard encode/decode paths to work +/// with raw bytes. The chunk_layout is recomputed accordingly. +/// +/// \param original The original metadata to base the void metadata on. +/// \returns A new metadata suitable for void access. +Result CreateVoidMetadata(const ZarrMetadata& original); + /// Encodes the field fill values as a zarr metadata "fill_value" JSON /// specification. 
///
diff --git a/tensorstore/driver/zarr/metadata_test.cc b/tensorstore/driver/zarr/metadata_test.cc
index 8778ce03d..11b64a874 100644
--- a/tensorstore/driver/zarr/metadata_test.cc
+++ b/tensorstore/driver/zarr/metadata_test.cc
@@ -804,4 +804,92 @@ TEST(DimensionSeparatorTest, JsonBinderTestInvalid) {
                        DimensionSeparatorJsonBinder);
 }
 
+using ::tensorstore::internal_zarr::CreateVoidMetadata;
+
+TEST(CreateVoidMetadataTest, SimpleType) {
+  // Create metadata for a simple int32 type
+  std::string_view metadata_text = R"({
+    "chunks": [10, 10],
+    "compressor": null,
+    "dtype": "<i4",
+    "fill_value": null,
+    "filters": null,
+    "order": "C",
+    "shape": [100, 100],
+    "zarr_format": 2
+  })";
+  TENSORSTORE_ASSERT_OK_AND_ASSIGN(
+      auto original,
+      ZarrMetadata::FromJson(::nlohmann::json::parse(metadata_text)));
+  TENSORSTORE_ASSERT_OK_AND_ASSIGN(auto void_metadata,
+                                   CreateVoidMetadata(original));
+
+  // Verify basic metadata fields are preserved
+  EXPECT_EQ(void_metadata->rank, original.rank);
+  EXPECT_EQ(void_metadata->shape, original.shape);
+  EXPECT_EQ(void_metadata->chunks, original.chunks);
+  EXPECT_EQ(void_metadata->order, original.order);
+  EXPECT_EQ(void_metadata->zarr_format, original.zarr_format);
+
+  // Verify dtype is converted to void type
+  EXPECT_FALSE(void_metadata->dtype.has_fields);
+  EXPECT_EQ(1, void_metadata->dtype.fields.size());
+  EXPECT_EQ(dtype_v<std::byte>, void_metadata->dtype.fields[0].dtype);
+  EXPECT_EQ(original.dtype.bytes_per_outer_element,
+            void_metadata->dtype.bytes_per_outer_element);
+
+  // Verify field_shape is {bytes_per_outer_element}
+  EXPECT_THAT(void_metadata->dtype.fields[0].field_shape,
+              ElementsAre(original.dtype.bytes_per_outer_element));
+
+  // Verify chunk_layout is computed correctly
+  EXPECT_EQ(void_metadata->chunk_layout.num_outer_elements,
+            original.chunk_layout.num_outer_elements);
+  EXPECT_EQ(void_metadata->chunk_layout.bytes_per_chunk,
+            original.chunk_layout.bytes_per_chunk);
+  EXPECT_EQ(1, void_metadata->chunk_layout.fields.size());
+
+  // Verify encoded and decoded layouts are equal (required for single-field
+  // optimized decode path)
+  EXPECT_EQ(void_metadata->chunk_layout.fields[0].encoded_chunk_layout,
+            void_metadata->chunk_layout.fields[0].decoded_chunk_layout);
+}
+
+TEST(CreateVoidMetadataTest, StructuredType) {
+  // Create metadata for a structured dtype with multiple fields
+  std::string_view metadata_text = R"({
+    "chunks": [10],
+    "compressor": {"id": "zlib", "level": 1},
+    "dtype": [["a", "<i4"], ["b", "|S10"]],
+    "fill_value": null,
+    "filters": null,
+    "order": "C",
+    "shape": [100],
+    "zarr_format": 2
+  })";
+  TENSORSTORE_ASSERT_OK_AND_ASSIGN(
+      auto original,
+      ZarrMetadata::FromJson(::nlohmann::json::parse(metadata_text)));
+  TENSORSTORE_ASSERT_OK_AND_ASSIGN(auto void_metadata,
+                                   CreateVoidMetadata(original));
+
+  // Verify dtype is collapsed to a single void field
+  EXPECT_FALSE(void_metadata->dtype.has_fields);
+  EXPECT_EQ(1, void_metadata->dtype.fields.size());
+  EXPECT_EQ(dtype_v<std::byte>, void_metadata->dtype.fields[0].dtype);
+
+  // bytes_per_outer_element should be 4 + 10 = 14
+  EXPECT_EQ(14, original.dtype.bytes_per_outer_element);
+  EXPECT_EQ(14, void_metadata->dtype.bytes_per_outer_element);
+  EXPECT_THAT(void_metadata->dtype.fields[0].field_shape, ElementsAre(14));
+
+  // Verify compressor is preserved
+  EXPECT_EQ(void_metadata->compressor, original.compressor);
+
+  // Verify order is preserved
+  EXPECT_EQ(void_metadata->order, original.order);
+}
+
 }  // namespace

From 62fd8f99b09b6225bae02ccdd6ff1d3b2e8e4265 Mon Sep 17 00:00:00 2001
From: BrianMichell
Date: Mon, 12 Jan 2026 19:01:23 +0000
Subject: [PATCH 13/14] Resolve
 `https://github.com/google/tensorstore/pull/272/changes#r2669205910`

Add error check for creating a structured Store but not providing the dtype
---
 tensorstore/driver/zarr/spec.cc      |  5 +++++
 tensorstore/driver/zarr/spec_test.cc | 18 ++++++++++++++++++
 2 files changed, 23 insertions(+)

diff --git a/tensorstore/driver/zarr/spec.cc b/tensorstore/driver/zarr/spec.cc
index 16ec03f1d..825eeecf5 100644
--- a/tensorstore/driver/zarr/spec.cc
+++ b/tensorstore/driver/zarr/spec.cc
@@ -186,6 +186,11 @@ Result GetNewMetadata(
         "\"dtype\" must be specified in \"metadata\" if \"field\" is "
         "specified");
   }
+  if (open_as_void) {
+    return absl::InvalidArgumentError(
+        "\"dtype\" must be specified in \"metadata\" if 
\"open_as_void\" is " + "specified"); + } if (!schema.dtype().valid()) { return absl::InvalidArgumentError("\"dtype\" must be specified"); } diff --git a/tensorstore/driver/zarr/spec_test.cc b/tensorstore/driver/zarr/spec_test.cc index 81f105735..eb4d1892e 100644 --- a/tensorstore/driver/zarr/spec_test.cc +++ b/tensorstore/driver/zarr/spec_test.cc @@ -638,6 +638,24 @@ TEST(GetNewMetadataTest, SelectedFieldDtypeNotSpecified) { "\"field\" is specified"))); } +TEST(GetNewMetadataTest, OpenAsVoidDtypeNotSpecified) { + // When open_as_void=true, dtype must be specified in metadata because + // open_as_void is for accessing existing structured data as raw bytes. + Schema schema; + TENSORSTORE_ASSERT_OK(schema.Set(Schema::Shape({100, 200}))); + TENSORSTORE_ASSERT_OK(schema.Set(dtype_v)); + TENSORSTORE_ASSERT_OK_AND_ASSIGN( + auto partial_metadata, + ZarrPartialMetadata::FromJson(::nlohmann::json::object_t())); + EXPECT_THAT( + tensorstore::internal_zarr::GetNewMetadata(partial_metadata, + /*selected_field=*/{}, schema, + /*open_as_void=*/true), + StatusIs(absl::StatusCode::kInvalidArgument, + HasSubstr("\"dtype\" must be specified in \"metadata\" if " + "\"open_as_void\" is specified"))); +} + TEST(GetNewMetadataTest, SelectedFieldInvalid) { EXPECT_THAT( GetNewMetadataFromOptions({{"dtype", {{"x", " Date: Mon, 12 Jan 2026 19:48:07 +0000 Subject: [PATCH 14/14] Resolve `https://github.com/google/tensorstore/pull/272/changes#r2669216841` Simplify field index logic for `open_as_void` case --- tensorstore/driver/zarr/driver.cc | 22 +++++++-------- tensorstore/driver/zarr/spec.cc | 41 +++++++++++----------------- tensorstore/driver/zarr/spec.h | 9 ++---- tensorstore/driver/zarr/spec_test.cc | 12 ++++---- 4 files changed, 35 insertions(+), 49 deletions(-) diff --git a/tensorstore/driver/zarr/driver.cc b/tensorstore/driver/zarr/driver.cc index 1bc72d006..358fce9b9 100644 --- a/tensorstore/driver/zarr/driver.cc +++ b/tensorstore/driver/zarr/driver.cc @@ -224,18 +224,17 @@ Result> ZarrDriverSpec::GetFillValue( const auto& metadata = partial_metadata; if (metadata.dtype && metadata.fill_value) { - TENSORSTORE_ASSIGN_OR_RETURN( - size_t field_index, - GetFieldIndex(*metadata.dtype, selected_field, open_as_void)); - // For void access, synthesize a byte-level fill value - if (field_index == kVoidFieldIndex) { + if (open_as_void) { const Index nbytes = metadata.dtype->bytes_per_outer_element; auto byte_arr = AllocateArray( span({nbytes}), c_order, value_init, dtype_v); fill_value = byte_arr; } else { + TENSORSTORE_ASSIGN_OR_RETURN( + size_t field_index, + GetFieldIndex(*metadata.dtype, selected_field)); fill_value = (*metadata.fill_value)[field_index]; } } @@ -610,14 +609,15 @@ class ZarrDriver::OpenState : public ZarrDriver::OpenStateBase { const auto& metadata = *static_cast(metadata_ptr); TENSORSTORE_RETURN_IF_ERROR( ValidateMetadata(metadata, spec().partial_metadata)); - TENSORSTORE_ASSIGN_OR_RETURN( - auto field_index, - GetFieldIndex(metadata.dtype, spec().selected_field, - spec().open_as_void)); - // For void access, map to component index 0 since we create a special + // For void access, use component index 0 since we create a special // component for raw byte access - if (field_index == kVoidFieldIndex) { + size_t field_index; + if (spec().open_as_void) { field_index = 0; + } else { + TENSORSTORE_ASSIGN_OR_RETURN( + field_index, + GetFieldIndex(metadata.dtype, spec().selected_field)); } TENSORSTORE_RETURN_IF_ERROR( ValidateMetadataSchema(metadata, field_index, spec().schema)); diff --git 
a/tensorstore/driver/zarr/spec.cc b/tensorstore/driver/zarr/spec.cc index 825eeecf5..f374db197 100644 --- a/tensorstore/driver/zarr/spec.cc +++ b/tensorstore/driver/zarr/spec.cc @@ -168,16 +168,16 @@ Result GetNewMetadata( // before validating the domain. size_t selected_field_index = 0; + const ZarrDType::Field* field_ptr = nullptr; if (partial_metadata.dtype) { - // If a zarr dtype is specified explicitly, determine the field index. If a - // multi-field zarr dtype is desired, it must be specified explicitly. - TENSORSTORE_ASSIGN_OR_RETURN( - selected_field_index, - GetFieldIndex(*partial_metadata.dtype, selected_field, open_as_void)); - // For void access, use field 0 for metadata creation since we use all - // fields as raw bytes - if (selected_field_index == kVoidFieldIndex) { - selected_field_index = 0; + if (open_as_void) { + field_ptr = partial_metadata.dtype->GetVoidField(); + } else { + TENSORSTORE_ASSIGN_OR_RETURN( + size_t field_index, + GetFieldIndex(*partial_metadata.dtype, selected_field)); + field_ptr = &partial_metadata.dtype->fields[field_index]; + selected_field_index = field_index; } metadata->dtype = *partial_metadata.dtype; } else { @@ -202,8 +202,9 @@ Result GetNewMetadata( static_cast(field), internal_zarr::ChooseBaseDType(schema.dtype())); TENSORSTORE_RETURN_IF_ERROR(ValidateDType(metadata->dtype)); + field_ptr = &field; } - auto& field = metadata->dtype.fields[selected_field_index]; + auto& field = *field_ptr; SpecRankAndFieldInfo info; info.full_rank = schema.rank(); @@ -357,12 +358,12 @@ Result GetSpecRankAndFieldInfo( info.chunked_rank = metadata.rank; if (metadata.dtype) { - TENSORSTORE_ASSIGN_OR_RETURN( - size_t field_index, - GetFieldIndex(*metadata.dtype, selected_field, open_as_void)); - if (field_index == kVoidFieldIndex) { + if (open_as_void) { info.field = metadata.dtype->GetVoidField(); } else { + TENSORSTORE_ASSIGN_OR_RETURN( + size_t field_index, + GetFieldIndex(*metadata.dtype, selected_field)); info.field = &metadata.dtype->fields[field_index]; } } @@ -550,17 +551,7 @@ std::string GetFieldNames(const ZarrDType& dtype) { } // namespace Result GetFieldIndex(const ZarrDType& dtype, - const SelectedField& selected_field, - bool open_as_void) { - // Special case: open_as_void requests raw byte access (works for any dtype) - if (open_as_void) { - if (dtype.fields.empty()) { - return absl::FailedPreconditionError( - "Requested void access but dtype has no fields"); - } - return kVoidFieldIndex; - } - + const SelectedField& selected_field) { if (selected_field.empty()) { if (dtype.fields.size() != 1) { return absl::FailedPreconditionError(tensorstore::StrCat( diff --git a/tensorstore/driver/zarr/spec.h b/tensorstore/driver/zarr/spec.h index b807cb984..92436755c 100644 --- a/tensorstore/driver/zarr/spec.h +++ b/tensorstore/driver/zarr/spec.h @@ -144,16 +144,11 @@ Result ParseSelectedField(const ::nlohmann::json& value); /// \param dtype The parsed zarr "dtype" specification. /// \param selected_field The label of the field, or an empty string to indicate /// that the zarr array must have only a single field. -/// \param open_as_void If true, returns kVoidFieldIndex for raw byte access. -/// \returns The field index, or kVoidFieldIndex if open_as_void is true. +/// \returns The field index. /// \error `absl::StatusCode::kFailedPrecondition` if `selected_field` is not /// valid. Result GetFieldIndex(const ZarrDType& dtype, - const SelectedField& selected_field, - bool open_as_void); - -/// Special field index indicating void (raw byte) access. 
-constexpr size_t kVoidFieldIndex = size_t(-1);
+                     const SelectedField& selected_field);
 
 /// Encodes a field index as a `SelectedField` JSON specification.
 ///
diff --git a/tensorstore/driver/zarr/spec_test.cc b/tensorstore/driver/zarr/spec_test.cc
index eb4d1892e..660e12429 100644
--- a/tensorstore/driver/zarr/spec_test.cc
+++ b/tensorstore/driver/zarr/spec_test.cc
@@ -187,12 +187,12 @@ TEST(ParseSelectedFieldTest, InvalidType) {
 }
 
 TEST(GetFieldIndexTest, Null) {
-  EXPECT_EQ(0u, GetFieldIndex(ParseDType("<i4").value(), "",
-                              /*open_as_void=*/false));
+  EXPECT_EQ(0u, GetFieldIndex(ParseDType("<i4").value(), ""));
 }
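
With the series applied end to end, a client opts into raw byte access purely through the spec. The following sketch is not part of the patches: the file path is hypothetical and error handling is abbreviated, but the spec keys and the Open/Read calls match those exercised in the tests above.

    #include <nlohmann/json.hpp>

    #include "absl/status/status.h"
    #include "tensorstore/open.h"
    #include "tensorstore/tensorstore.h"
    #include "tensorstore/util/result.h"

    // Sketch: open an existing zarr v2 array as raw bytes via "open_as_void".
    absl::Status ReadRawBytes() {
      ::nlohmann::json spec{
          {"driver", "zarr"},
          {"kvstore", {{"driver", "file"}, {"path", "/tmp/example_array/"}}},
          {"open_as_void", true},
      };
      TENSORSTORE_ASSIGN_OR_RETURN(
          auto store, tensorstore::Open(spec, tensorstore::OpenMode::open,
                                        tensorstore::ReadWriteMode::read)
                          .result());
      // The void view appends one dimension of extent bytes_per_outer_element:
      // an N-d array of 4-byte elements reads back as an (N+1)-d byte array.
      TENSORSTORE_ASSIGN_OR_RETURN(auto raw_bytes,
                                   tensorstore::Read(store).result());
      // raw_bytes.dtype() is the byte dtype; raw_bytes.shape().back() is the
      // element size in bytes (e.g. 4 for "<i4").
      return absl::OkStatus();
    }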