diff --git a/tensorstore/driver/zarr/driver.cc b/tensorstore/driver/zarr/driver.cc index 69164648e..358fce9b9 100644 --- a/tensorstore/driver/zarr/driver.cc +++ b/tensorstore/driver/zarr/driver.cc @@ -29,6 +29,10 @@ #include "absl/status/status.h" #include "absl/strings/cord.h" #include +#include "riegeli/bytes/cord_reader.h" +#include "riegeli/bytes/cord_writer.h" +#include "riegeli/bytes/read_all.h" +#include "riegeli/bytes/write.h" #include "tensorstore/array.h" #include "tensorstore/array_storage_statistics.h" #include "tensorstore/box.h" @@ -55,6 +59,7 @@ #include "tensorstore/internal/chunk_grid_specification.h" #include "tensorstore/internal/grid_storage_statistics.h" #include "tensorstore/internal/intrusive_ptr.h" +#include "tensorstore/internal/riegeli/array_endian_codec.h" #include "tensorstore/internal/json_binding/bindable.h" #include "tensorstore/internal/json_binding/json_binding.h" #include "tensorstore/internal/uri_utils.h" @@ -137,7 +142,8 @@ absl::Status ZarrDriverSpec::ApplyOptions(SpecOptions&& options) { } Result ZarrDriverSpec::GetSpecInfo() const { - return GetSpecRankAndFieldInfo(partial_metadata, selected_field, schema); + return GetSpecRankAndFieldInfo(partial_metadata, selected_field, schema, + open_as_void); } TENSORSTORE_DEFINE_JSON_DEFAULT_BINDER( @@ -171,7 +177,16 @@ TENSORSTORE_DEFINE_JSON_DEFAULT_BINDER( jb::Member("field", jb::Projection<&ZarrDriverSpec::selected_field>( jb::DefaultValue( [](auto* obj) { *obj = std::string{}; }))), + jb::Member("open_as_void", + jb::Projection<&ZarrDriverSpec::open_as_void>( + jb::DefaultValue( + [](auto* v) { *v = false; }))), jb::Initialize([](auto* obj) { + // Validate that field and open_as_void are mutually exclusive + if (obj->open_as_void && !obj->selected_field.empty()) { + return absl::InvalidArgumentError( + "\"field\" and \"open_as_void\" are mutually exclusive"); + } TENSORSTORE_ASSIGN_OR_RETURN(auto info, obj->GetSpecInfo()); if (info.full_rank != dynamic_rank) { 
TENSORSTORE_RETURN_IF_ERROR( @@ -209,9 +224,19 @@ Result> ZarrDriverSpec::GetFillValue( const auto& metadata = partial_metadata; if (metadata.dtype && metadata.fill_value) { - TENSORSTORE_ASSIGN_OR_RETURN( - size_t field_index, GetFieldIndex(*metadata.dtype, selected_field)); - fill_value = (*metadata.fill_value)[field_index]; + // For void access, synthesize a byte-level fill value + if (open_as_void) { + const Index nbytes = metadata.dtype->bytes_per_outer_element; + auto byte_arr = AllocateArray( + span({nbytes}), c_order, value_init, + dtype_v); + fill_value = byte_arr; + } else { + TENSORSTORE_ASSIGN_OR_RETURN( + size_t field_index, + GetFieldIndex(*metadata.dtype, selected_field)); + fill_value = (*metadata.fill_value)[field_index]; + } } if (!fill_value.valid() || !transform.valid()) { @@ -356,6 +381,7 @@ absl::Status DataCache::GetBoundSpecData( const auto& metadata = *static_cast(metadata_ptr); spec.selected_field = EncodeSelectedField(component_index, metadata.dtype); spec.metadata_key = metadata_key_; + spec.open_as_void = false; auto& pm = spec.partial_metadata; pm.rank = metadata.rank; pm.zarr_format = metadata.zarr_format; @@ -382,6 +408,58 @@ Result DataCache::GetChunkLayoutFromMetadata( } std::string DataCache::GetBaseKvstorePath() { return key_prefix_; } + +// VoidDataCache implementation +// Uses inherited DataCache constructor and encode/decode methods. +// The void metadata (with dtype containing only the void field) is created +// in GetDataCache and passed via the initializer, so standard encode/decode +// paths work correctly. 
+
+// Accepts `new_metadata_ptr` when it keeps the byte width per element, chunk
+// shape, order and compressor of `existing_metadata_ptr`; dtype and shape are
+// deliberately not compared, because void access only sees raw bytes.
+absl::Status VoidDataCache::ValidateMetadataCompatibility(
+    const void* existing_metadata_ptr, const void* new_metadata_ptr) {
+  assert(existing_metadata_ptr);
+  assert(new_metadata_ptr);
+  const auto& existing_metadata =
+      *static_cast<const ZarrMetadata*>(existing_metadata_ptr);
+  const auto& new_metadata =
+      *static_cast<const ZarrMetadata*>(new_metadata_ptr);
+  if (existing_metadata.dtype.bytes_per_outer_element !=
+      new_metadata.dtype.bytes_per_outer_element) {
+    return absl::FailedPreconditionError(tensorstore::StrCat(
+        "Void access metadata bytes_per_outer_element mismatch: existing=",
+        existing_metadata.dtype.bytes_per_outer_element,
+        ", new=", new_metadata.dtype.bytes_per_outer_element));
+  }
+  if (existing_metadata.chunks != new_metadata.chunks) {
+    return absl::FailedPreconditionError("Chunk shape mismatch");
+  }
+  if (existing_metadata.order != new_metadata.order) {
+    return absl::FailedPreconditionError("Order mismatch");
+  }
+  // Compare the JSON form: `compressor` is assumed to be a pointer-like handle
+  // (TODO confirm in compressor.h), so `!=` on it would test object identity
+  // and reject an identical configuration that was merely parsed again.
+  if (::nlohmann::json(existing_metadata.compressor) !=
+      ::nlohmann::json(new_metadata.compressor)) {
+    return absl::FailedPreconditionError("Compressor mismatch");
+  }
+  return absl::OkStatus();
+}
+
+// Delegates to the base class, then re-marks the spec as raw-byte access
+// (the base implementation resets `open_as_void` to false).
+absl::Status VoidDataCache::GetBoundSpecData(
+    internal_kvs_backed_chunk_driver::KvsDriverSpec& spec_base,
+    const void* metadata_ptr, size_t component_index) {
+  TENSORSTORE_RETURN_IF_ERROR(
+      DataCache::GetBoundSpecData(spec_base, metadata_ptr, component_index));
+  static_cast<ZarrDriverSpec&>(spec_base).open_as_void = true;
+  return absl::OkStatus();
+}
+
 Result ZarrDriver::GetCodec() {
   return internal_zarr::GetCodecSpecFromMetadata(metadata());
 }
@@ -416,6 +494,10 @@ Result ZarrDriverSpec::ToUrl() const {
     return
absl::InvalidArgumentError( "zarr2 URL syntax not supported with selected_field specified"); } + if (open_as_void) { + return absl::InvalidArgumentError( + "zarr2 URL syntax not supported with open_as_void specified"); + } TENSORSTORE_ASSIGN_OR_RETURN(auto base_url, store.ToUrl()); return tensorstore::StrCat(base_url, "|", kUrlScheme, ":"); } @@ -451,7 +533,7 @@ Future ZarrDriver::GetStorageStatistics( /*chunk_shape=*/grid.chunk_shape, /*shape=*/metadata->shape, /*dimension_separator=*/ - GetDimensionSeparatorChar(cache->dimension_separator_), + GetDimensionSeparatorChar(cache->dimension_separator()), staleness_bound, request.options)); }), std::move(promise), std::move(metadata_future)); @@ -483,7 +565,8 @@ class ZarrDriver::OpenState : public ZarrDriver::OpenStateBase { TENSORSTORE_ASSIGN_OR_RETURN( auto metadata, internal_zarr::GetNewMetadata(spec().partial_metadata, - spec().selected_field, spec().schema), + spec().selected_field, spec().schema, + spec().open_as_void), tensorstore::MaybeAnnotateStatus( _, "Cannot create using specified \"metadata\" and schema")); return metadata; @@ -496,17 +579,28 @@ class ZarrDriver::OpenState : public ZarrDriver::OpenStateBase { internal::EncodeCacheKey( &result, spec.store.path, GetDimensionSeparator(spec.partial_metadata, zarr_metadata), - zarr_metadata, spec.metadata_key); + zarr_metadata, spec.metadata_key, + spec.open_as_void ? "void" : "normal"); return result; } std::unique_ptr GetDataCache( DataCache::Initializer&& initializer) override { - const auto& metadata = + const auto& original_metadata = *static_cast(initializer.metadata.get()); + auto dim_sep = GetDimensionSeparator(spec().partial_metadata, original_metadata); + if (spec().open_as_void) { + // Create void metadata from the original. This modifies the dtype to + // contain only the void field, allowing standard encode/decode to work. 
+ // CreateVoidMetadata uses the same chunks and bytes_per_outer_element as + // the original validated metadata, so it should never fail. + initializer.metadata = CreateVoidMetadata(original_metadata).value(); + return std::make_unique( + std::move(initializer), spec().store.path, dim_sep, + spec().metadata_key); + } return std::make_unique( - std::move(initializer), spec().store.path, - GetDimensionSeparator(spec().partial_metadata, metadata), + std::move(initializer), spec().store.path, dim_sep, spec().metadata_key); } @@ -515,8 +609,16 @@ class ZarrDriver::OpenState : public ZarrDriver::OpenStateBase { const auto& metadata = *static_cast(metadata_ptr); TENSORSTORE_RETURN_IF_ERROR( ValidateMetadata(metadata, spec().partial_metadata)); - TENSORSTORE_ASSIGN_OR_RETURN( - auto field_index, GetFieldIndex(metadata.dtype, spec().selected_field)); + // For void access, use component index 0 since we create a special + // component for raw byte access + size_t field_index; + if (spec().open_as_void) { + field_index = 0; + } else { + TENSORSTORE_ASSIGN_OR_RETURN( + field_index, + GetFieldIndex(metadata.dtype, spec().selected_field)); + } TENSORSTORE_RETURN_IF_ERROR( ValidateMetadataSchema(metadata, field_index, spec().schema)); return field_index; diff --git a/tensorstore/driver/zarr/driver_impl.h b/tensorstore/driver/zarr/driver_impl.h index df3c3930f..343383554 100644 --- a/tensorstore/driver/zarr/driver_impl.h +++ b/tensorstore/driver/zarr/driver_impl.h @@ -63,10 +63,11 @@ class ZarrDriverSpec ZarrPartialMetadata partial_metadata; SelectedField selected_field; std::string metadata_key; + bool open_as_void = false; constexpr static auto ApplyMembers = [](auto& x, auto f) { return f(internal::BaseCast(x), x.partial_metadata, - x.selected_field, x.metadata_key); + x.selected_field, x.metadata_key, x.open_as_void); }; absl::Status ApplyOptions(SpecOptions&& options) override; @@ -137,11 +138,36 @@ class DataCache : public internal_kvs_backed_chunk_driver::DataCache { 
std::string GetBaseKvstorePath() override; + DimensionSeparator dimension_separator() const { return dimension_separator_; } + + protected: std::string key_prefix_; DimensionSeparator dimension_separator_; std::string metadata_key_; }; +/// Derived DataCache for open_as_void mode that provides raw byte access. +/// +/// The void metadata (created via CreateVoidMetadata) has dtype.fields +/// containing only the void field, so inherited encode/decode methods +/// work correctly for raw byte access. GetBoundSpecData is overridden +/// to set open_as_void=true in the spec, and ValidateMetadataCompatibility +/// is overridden to allow different dtypes with the same bytes_per_outer_element. +class VoidDataCache : public DataCache { + public: + using DataCache::DataCache; + + /// For void access, metadata is compatible if bytes_per_outer_element matches, + /// regardless of the actual dtype (since we treat everything as raw bytes). + absl::Status ValidateMetadataCompatibility( + const void* existing_metadata_ptr, + const void* new_metadata_ptr) override; + + absl::Status GetBoundSpecData( + internal_kvs_backed_chunk_driver::KvsDriverSpec& spec_base, + const void* metadata_ptr, size_t component_index) override; +}; + class ZarrDriver; using ZarrDriverBase = internal_kvs_backed_chunk_driver::RegisteredKvsDriver< ZarrDriver, ZarrDriverSpec, DataCache, diff --git a/tensorstore/driver/zarr/driver_test.cc b/tensorstore/driver/zarr/driver_test.cc index 92c5be48a..dc670e9a5 100644 --- a/tensorstore/driver/zarr/driver_test.cc +++ b/tensorstore/driver/zarr/driver_test.cc @@ -3499,4 +3499,681 @@ TEST(DriverTest, UrlSchemeRoundtrip) { {"kvstore", {{"driver", "memory"}, {"path", "abc.zarr/def/"}}}}); } +// Tests for open_as_void functionality + +TEST(ZarrDriverTest, OpenAsVoidSimpleType) { + // Test open_as_void with a simple data type (int16) + auto context = Context::Default(); + + // First create a normal array + ::nlohmann::json create_spec{ + {"driver", "zarr"}, + {"kvstore", 
{{"driver", "memory"}, {"path", "prefix/"}}}, + {"metadata", + { + {"compressor", nullptr}, + {"dtype", "({{1, 2}, {3, 4}}); + TENSORSTORE_EXPECT_OK( + tensorstore::Write(data, store | tensorstore::Dims(0, 1).SizedInterval( + {0, 0}, {2, 2})) + .result()); + + // Now open with open_as_void=true + ::nlohmann::json void_spec{ + {"driver", "zarr"}, + {"kvstore", {{"driver", "memory"}, {"path", "prefix/"}}}, + {"open_as_void", true}, + }; + + TENSORSTORE_ASSERT_OK_AND_ASSIGN( + auto void_store, + tensorstore::Open(void_spec, context, tensorstore::OpenMode::open, + tensorstore::ReadWriteMode::read) + .result()); + + // The void store should have rank = original_rank + 1 (for bytes dimension) + EXPECT_EQ(3, void_store.rank()); + + // The last dimension should be the size of the data type (2 bytes for int16) + EXPECT_EQ(2, void_store.domain().shape()[2]); + + // The data type should be byte + EXPECT_EQ(tensorstore::dtype_v, + void_store.dtype()); +} + +TEST(ZarrDriverTest, OpenAsVoidStructuredType) { + // Test open_as_void with a structured data type + auto context = Context::Default(); + + // Create an array with a structured dtype + ::nlohmann::json create_spec{ + {"driver", "zarr"}, + {"kvstore", {{"driver", "memory"}, {"path", "prefix/"}}}, + {"field", "y"}, + {"metadata", + { + {"compressor", nullptr}, + {"dtype", ::nlohmann::json::array_t{{"x", "|u1"}, {"y", "({{100, 200}, {300, 400}}); + TENSORSTORE_EXPECT_OK( + tensorstore::Write(data, store | tensorstore::Dims(0, 1).SizedInterval( + {0, 0}, {2, 2})) + .result()); + + // Now open with open_as_void=true - this should give raw access to the entire + // struct + ::nlohmann::json void_spec{ + {"driver", "zarr"}, + {"kvstore", {{"driver", "memory"}, {"path", "prefix/"}}}, + {"open_as_void", true}, + }; + + TENSORSTORE_ASSERT_OK_AND_ASSIGN( + auto void_store, + tensorstore::Open(void_spec, context, tensorstore::OpenMode::open, + tensorstore::ReadWriteMode::read) + .result()); + + // The void store should have rank = 
original_rank + 1 (for bytes dimension) + EXPECT_EQ(3, void_store.rank()); + + // The last dimension should be 3 bytes (1 byte for u1 + 2 bytes for i2) + EXPECT_EQ(3, void_store.domain().shape()[2]); + + // The data type should be byte + EXPECT_EQ(tensorstore::dtype_v, + void_store.dtype()); +} + +TEST(ZarrDriverTest, OpenAsVoidWithCompression) { + // Test open_as_void with compression enabled + auto context = Context::Default(); + + // Create an array with blosc compression + ::nlohmann::json create_spec{ + {"driver", "zarr"}, + {"kvstore", {{"driver", "memory"}, {"path", "prefix/"}}}, + {"metadata", + { + {"compressor", {{"id", "blosc"}}}, + {"dtype", "({{0x01020304, 0x05060708}, + {0x090a0b0c, 0x0d0e0f10}}); + TENSORSTORE_EXPECT_OK( + tensorstore::Write(data, store | tensorstore::Dims(0, 1).SizedInterval( + {0, 0}, {2, 2})) + .result()); + + // Now open with open_as_void=true + ::nlohmann::json void_spec{ + {"driver", "zarr"}, + {"kvstore", {{"driver", "memory"}, {"path", "prefix/"}}}, + {"open_as_void", true}, + }; + + TENSORSTORE_ASSERT_OK_AND_ASSIGN( + auto void_store, + tensorstore::Open(void_spec, context, tensorstore::OpenMode::open, + tensorstore::ReadWriteMode::read) + .result()); + + // The void store should have rank = original_rank + 1 (for bytes dimension) + EXPECT_EQ(3, void_store.rank()); + + // The last dimension should be 4 bytes for int32 + EXPECT_EQ(4, void_store.domain().shape()[2]); + + // The data type should be byte + EXPECT_EQ(tensorstore::dtype_v, + void_store.dtype()); + + // Read the raw bytes and verify decompression works + TENSORSTORE_ASSERT_OK_AND_ASSIGN( + auto read_result, + tensorstore::Read(void_store | tensorstore::Dims(0, 1).SizedInterval( + {0, 0}, {2, 2})) + .result()); + EXPECT_EQ(read_result.shape()[0], 2); + EXPECT_EQ(read_result.shape()[1], 2); + EXPECT_EQ(read_result.shape()[2], 4); +} + +TEST(ZarrDriverTest, OpenAsVoidSpecRoundtrip) { + // Test that open_as_void is properly preserved in spec round-trips + 
::nlohmann::json json_spec{ + {"driver", "zarr"}, + {"kvstore", {{"driver", "memory"}, {"path", "prefix/"}}}, + {"open_as_void", true}, + {"metadata", + { + {"compressor", nullptr}, + {"dtype", "({{0x0102, 0x0304}, + {0x0506, 0x0708}}); + TENSORSTORE_EXPECT_OK(tensorstore::Write(data, store).result()); + + // Open as void and read + ::nlohmann::json void_spec{ + {"driver", "zarr"}, + {"kvstore", {{"driver", "memory"}, {"path", "prefix/"}}}, + {"open_as_void", true}, + }; + + TENSORSTORE_ASSERT_OK_AND_ASSIGN( + auto void_store, + tensorstore::Open(void_spec, context, tensorstore::OpenMode::open, + tensorstore::ReadWriteMode::read_write) + .result()); + + // Read the raw bytes + TENSORSTORE_ASSERT_OK_AND_ASSIGN(auto bytes_read, + tensorstore::Read(void_store).result()); + + // Verify shape: [2, 2, 2] where last dim is 2 bytes per uint16 + EXPECT_EQ(bytes_read.shape()[0], 2); + EXPECT_EQ(bytes_read.shape()[1], 2); + EXPECT_EQ(bytes_read.shape()[2], 2); + + // Verify the raw bytes (little endian) + auto bytes_ptr = static_cast(bytes_read.data()); + // First element: 0x0102 -> bytes 0x02, 0x01 (little endian) + EXPECT_EQ(bytes_ptr[0], 0x02); + EXPECT_EQ(bytes_ptr[1], 0x01); +} + +TEST(ZarrDriverTest, OpenAsVoidWriteRoundtrip) { + // Test that writing through open_as_void correctly encodes data + // and can be read back both through void access and normal typed access. 
+ auto context = Context::Default(); + + // Create an array + ::nlohmann::json create_spec{ + {"driver", "zarr"}, + {"kvstore", {{"driver", "memory"}, {"path", "prefix/"}}}, + {"metadata", + { + {"compressor", nullptr}, + {"dtype", "({{0, 0}, {0, 0}}); + TENSORSTORE_EXPECT_OK(tensorstore::Write(zeros, store).result()); + + // Open as void for writing + ::nlohmann::json void_spec{ + {"driver", "zarr"}, + {"kvstore", {{"driver", "memory"}, {"path", "prefix/"}}}, + {"open_as_void", true}, + }; + + TENSORSTORE_ASSERT_OK_AND_ASSIGN( + auto void_store, + tensorstore::Open(void_spec, context, tensorstore::OpenMode::open, + tensorstore::ReadWriteMode::read_write) + .result()); + + // Create raw bytes representing uint16 values in little endian: + // [0,0]: 0x1234 -> {0x34, 0x12}, [0,1]: 0x5678 -> {0x78, 0x56} + // [1,0]: 0x9ABC -> {0xBC, 0x9A}, [1,1]: 0xDEF0 -> {0xF0, 0xDE} + auto raw_bytes = tensorstore::AllocateArray( + {2, 2, 2}, tensorstore::c_order, tensorstore::value_init); + auto raw_bytes_ptr = static_cast( + const_cast(static_cast(raw_bytes.data()))); + // Element [0,0] = 0x1234 + raw_bytes_ptr[0] = 0x34; + raw_bytes_ptr[1] = 0x12; + // Element [0,1] = 0x5678 + raw_bytes_ptr[2] = 0x78; + raw_bytes_ptr[3] = 0x56; + // Element [1,0] = 0x9ABC + raw_bytes_ptr[4] = 0xBC; + raw_bytes_ptr[5] = 0x9A; + // Element [1,1] = 0xDEF0 + raw_bytes_ptr[6] = 0xF0; + raw_bytes_ptr[7] = 0xDE; + + // Write raw bytes through void access + TENSORSTORE_EXPECT_OK(tensorstore::Write(raw_bytes, void_store).result()); + + // Read back through void access and verify + TENSORSTORE_ASSERT_OK_AND_ASSIGN(auto bytes_read, + tensorstore::Read(void_store).result()); + auto bytes_read_ptr = static_cast(bytes_read.data()); + EXPECT_EQ(bytes_read_ptr[0], 0x34); + EXPECT_EQ(bytes_read_ptr[1], 0x12); + + // Read back through normal typed access and verify the values + TENSORSTORE_ASSERT_OK_AND_ASSIGN( + auto typed_store, + tensorstore::Open(create_spec, context, tensorstore::OpenMode::open, + 
tensorstore::ReadWriteMode::read) + .result()); + + TENSORSTORE_ASSERT_OK_AND_ASSIGN(auto typed_read, + tensorstore::Read(typed_store).result()); + auto typed_ptr = static_cast(typed_read.data()); + EXPECT_EQ(typed_ptr[0], 0x1234); + EXPECT_EQ(typed_ptr[1], 0x5678); + EXPECT_EQ(typed_ptr[2], 0x9ABC); + EXPECT_EQ(typed_ptr[3], 0xDEF0); +} + +TEST(ZarrDriverTest, OpenAsVoidWriteWithCompression) { + // Test writing through open_as_void with compression enabled. + // Verifies that the EncodeChunk method correctly compresses data. + auto context = Context::Default(); + + // Create an array with blosc compression + ::nlohmann::json create_spec{ + {"driver", "zarr"}, + {"kvstore", {{"driver", "memory"}, {"path", "prefix/"}}}, + {"metadata", + { + {"compressor", {{"id", "blosc"}}}, + {"dtype", "( + {{0, 0, 0, 0}, {0, 0, 0, 0}, {0, 0, 0, 0}, {0, 0, 0, 0}}); + TENSORSTORE_EXPECT_OK(tensorstore::Write(zeros, store).result()); + + // Open as void for writing + ::nlohmann::json void_spec{ + {"driver", "zarr"}, + {"kvstore", {{"driver", "memory"}, {"path", "prefix/"}}}, + {"open_as_void", true}, + }; + + TENSORSTORE_ASSERT_OK_AND_ASSIGN( + auto void_store, + tensorstore::Open(void_spec, context, tensorstore::OpenMode::open, + tensorstore::ReadWriteMode::read_write) + .result()); + + // Create raw bytes representing int32 values in little endian + // Using a simple pattern: 0x01020304 at position [0,0] + auto raw_bytes = tensorstore::AllocateArray( + {4, 4, 4}, tensorstore::c_order, tensorstore::value_init); + + // Set first element to 0x01020304 (little endian: 04 03 02 01) + auto raw_bytes_ptr = static_cast( + const_cast(static_cast(raw_bytes.data()))); + raw_bytes_ptr[0] = 0x04; + raw_bytes_ptr[1] = 0x03; + raw_bytes_ptr[2] = 0x02; + raw_bytes_ptr[3] = 0x01; + + // Write raw bytes through void access (triggers compression) + TENSORSTORE_EXPECT_OK(tensorstore::Write(raw_bytes, void_store).result()); + + // Read back through normal typed access + 
TENSORSTORE_ASSERT_OK_AND_ASSIGN( + auto typed_store, + tensorstore::Open(create_spec, context, tensorstore::OpenMode::open, + tensorstore::ReadWriteMode::read) + .result()); + + TENSORSTORE_ASSERT_OK_AND_ASSIGN(auto typed_read, + tensorstore::Read(typed_store).result()); + auto typed_ptr = static_cast(typed_read.data()); + + // First element should be 0x01020304 + EXPECT_EQ(typed_ptr[0], 0x01020304); + // Rest should be zeros + EXPECT_EQ(typed_ptr[1], 0); +} + +// Tests for GetSpecInfo() with open_as_void + +TEST(ZarrDriverTest, GetSpecInfoOpenAsVoidWithKnownRank) { + // Test that GetSpecInfo correctly computes rank when open_as_void=true + // and dtype is specified with known chunked_rank. + // Expected: full_rank = chunked_rank + 1 (for bytes dimension) + ::nlohmann::json json_spec{ + {"driver", "zarr"}, + {"kvstore", {{"driver", "memory"}, {"path", "prefix/"}}}, + {"open_as_void", true}, + {"metadata", + { + {"dtype", "'; } +const ZarrDType::Field* ZarrDType::GetVoidField() const { + if (!void_field_cache_) { + const Index nbytes = bytes_per_outer_element; + void_field_cache_ = Field{ + {/*encoded_dtype=*/tensorstore::StrCat("|V", nbytes), + /*dtype=*/dtype_v<::tensorstore::dtypes::byte_t>, + /*endian=*/endian::native, + /*flexible_shape=*/{}}, + /*outer_shape=*/{}, + /*name=*/{}, + /*field_shape=*/{nbytes}, + /*num_inner_elements=*/nbytes, + /*byte_offset=*/0, + /*num_bytes=*/nbytes}; + } + return &*void_field_cache_; +} + Result ChooseBaseDType(DataType dtype) { ZarrDType::BaseDType base_dtype; base_dtype.endian = endian::native; diff --git a/tensorstore/driver/zarr/dtype.h b/tensorstore/driver/zarr/dtype.h index be858d671..1ae652f9b 100644 --- a/tensorstore/driver/zarr/dtype.h +++ b/tensorstore/driver/zarr/dtype.h @@ -114,11 +114,18 @@ struct ZarrDType { /// Bytes per "outer" element (derived value). Index bytes_per_outer_element; + /// Returns a synthesized field for raw byte access to the entire dtype. 
+  /// The returned pointer is valid for the lifetime of this ZarrDType.
+  const Field* GetVoidField() const;
+
   TENSORSTORE_DECLARE_JSON_DEFAULT_BINDER(ZarrDType,
                                           internal_json_binding::NoOptions)
 
   friend void to_json(::nlohmann::json& out,  // NOLINT
                       const ZarrDType& dtype);
+
+  /// Lazily-computed cache for GetVoidField().
+  mutable std::optional void_field_cache_;
 };
 
 /// Parses a zarr metadata "dtype" JSON specification.
diff --git a/tensorstore/driver/zarr/metadata.cc b/tensorstore/driver/zarr/metadata.cc
index 75bef0676..77ca20ce1 100644
--- a/tensorstore/driver/zarr/metadata.cc
+++ b/tensorstore/driver/zarr/metadata.cc
@@ -366,6 +366,32 @@ Result ComputeChunkLayout(
   return layout;
 }
 
+// Builds the metadata used for raw-byte access: a copy of `original` whose
+// dtype is replaced by the single synthesized void field.
+Result<ZarrMetadataPtr> CreateVoidMetadata(const ZarrMetadata& original) {
+  auto metadata = std::make_shared<ZarrMetadata>(original);
+
+  // Swap in the void dtype; `bytes_per_outer_element` is kept from the copy.
+  // NOTE(review): GetVoidField() fills a mutable cache inside `original.dtype`
+  // without locking; confirm `original` cannot be shared across threads here.
+  metadata->dtype.has_fields = false;
+  metadata->dtype.fields = {*original.dtype.GetVoidField()};
+
+  // The copied fill values belong to the original fields, and `resize(1)` on
+  // its own would keep the first of them for the byte-typed void field.
+  // Start from an empty vector so the single entry is null; an empty/null
+  // fill value is handled by GetChunkGridSpecification.
+  metadata->fill_value.clear();
+  metadata->fill_value.resize(1);
+
+  // Recompute `chunk_layout` through ValidateMetadata.  The void field spans
+  // `bytes_per_outer_element` bytes, so the encoded and decoded layouts match,
+  // as DecodeChunk requires.
+  TENSORSTORE_RETURN_IF_ERROR(ValidateMetadata(*metadata));
+
+  return metadata;
+}
+
 constexpr auto MetadataJsonBinder = [](auto maybe_optional) {
   return [=](auto is_loading, const auto& options, auto* obj, auto* j) {
     using T = absl::remove_cvref_t;
diff --git a/tensorstore/driver/zarr/metadata.h b/tensorstore/driver/zarr/metadata.h
index 3a643e407..25afe531b 100644
--- a/tensorstore/driver/zarr/metadata.h
+++ b/tensorstore/driver/zarr/metadata.h
@@ -199,6 +199,16 @@ Result ComputeChunkLayout(
     const ZarrDType& dtype, ContiguousLayoutOrder order,
     tensorstore::span chunk_shape);
 
+/// Creates a modified ZarrMetadata for void (raw byte) access.
+///
+/// The returned metadata has dtype.fields containing only the void field
+/// (from GetVoidField()) and a single null fill_value entry, allowing
+/// standard encode/decode paths to work with raw bytes. The chunk_layout is
+/// recomputed accordingly.
+/// \param original The original metadata to base the void metadata on.
+/// \returns A new metadata suitable for void access.
+Result<ZarrMetadataPtr> CreateVoidMetadata(const ZarrMetadata& original);
+
 /// Encodes the field fill values as a zarr metadata "fill_value" JSON
 /// specification.
/// diff --git a/tensorstore/driver/zarr/metadata_test.cc b/tensorstore/driver/zarr/metadata_test.cc index 8778ce03d..11b64a874 100644 --- a/tensorstore/driver/zarr/metadata_test.cc +++ b/tensorstore/driver/zarr/metadata_test.cc @@ -804,4 +804,92 @@ TEST(DimensionSeparatorTest, JsonBinderTestInvalid) { DimensionSeparatorJsonBinder); } +using ::tensorstore::internal_zarr::CreateVoidMetadata; + +TEST(CreateVoidMetadataTest, SimpleType) { + // Create metadata for a simple int32 type + std::string_view metadata_text = R"({ + "chunks": [10, 10], + "compressor": null, + "dtype": "rank, original.rank); + EXPECT_EQ(void_metadata->shape, original.shape); + EXPECT_EQ(void_metadata->chunks, original.chunks); + EXPECT_EQ(void_metadata->order, original.order); + EXPECT_EQ(void_metadata->zarr_format, original.zarr_format); + + // Verify dtype is converted to void type + EXPECT_FALSE(void_metadata->dtype.has_fields); + EXPECT_EQ(1, void_metadata->dtype.fields.size()); + EXPECT_EQ(dtype_v, void_metadata->dtype.fields[0].dtype); + EXPECT_EQ(original.dtype.bytes_per_outer_element, + void_metadata->dtype.bytes_per_outer_element); + + // Verify field_shape is {bytes_per_outer_element} + EXPECT_THAT(void_metadata->dtype.fields[0].field_shape, + ElementsAre(original.dtype.bytes_per_outer_element)); + + // Verify chunk_layout is computed correctly + EXPECT_EQ(void_metadata->chunk_layout.num_outer_elements, + original.chunk_layout.num_outer_elements); + EXPECT_EQ(void_metadata->chunk_layout.bytes_per_chunk, + original.chunk_layout.bytes_per_chunk); + EXPECT_EQ(1, void_metadata->chunk_layout.fields.size()); + + // Verify encoded and decoded layouts are equal (required for single-field + // optimized decode path) + EXPECT_EQ(void_metadata->chunk_layout.fields[0].encoded_chunk_layout, + void_metadata->chunk_layout.fields[0].decoded_chunk_layout); +} + +TEST(CreateVoidMetadataTest, StructuredType) { + // Create metadata for a structured dtype with multiple fields + std::string_view 
metadata_text = R"({ + "chunks": [10], + "compressor": {"id": "zlib", "level": 1}, + "dtype": [["a", "dtype.has_fields); + EXPECT_EQ(1, void_metadata->dtype.fields.size()); + EXPECT_EQ(dtype_v, void_metadata->dtype.fields[0].dtype); + + // bytes_per_outer_element should be 4 + 10 = 14 + EXPECT_EQ(14, original.dtype.bytes_per_outer_element); + EXPECT_EQ(14, void_metadata->dtype.bytes_per_outer_element); + EXPECT_THAT(void_metadata->dtype.fields[0].field_shape, ElementsAre(14)); + + // Verify compressor is preserved + EXPECT_EQ(void_metadata->compressor, original.compressor); + + // Verify order is preserved + EXPECT_EQ(void_metadata->order, original.order); +} + } // namespace diff --git a/tensorstore/driver/zarr/schema.yml b/tensorstore/driver/zarr/schema.yml index 45711648c..91f45eb98 100644 --- a/tensorstore/driver/zarr/schema.yml +++ b/tensorstore/driver/zarr/schema.yml @@ -15,8 +15,18 @@ allOf: title: Name of field to open. description: | Must be specified if the `.metadata.dtype` specified in the array - metadata has more than one field. + metadata has more than one field. Cannot be specified together with + :json:`"open_as_void": true`. default: null + open_as_void: + type: boolean + title: Raw byte access mode. + description: | + When true, opens the array as raw bytes instead of interpreting it + as structured data. The resulting array will have an additional + dimension representing the byte layout of each element. Cannot be + :json:`true` if ``field`` is also specified. + default: false metadata: title: Zarr array metadata. 
description: | diff --git a/tensorstore/driver/zarr/spec.cc b/tensorstore/driver/zarr/spec.cc index 34a2825f9..f374db197 100644 --- a/tensorstore/driver/zarr/spec.cc +++ b/tensorstore/driver/zarr/spec.cc @@ -151,7 +151,8 @@ absl::Status ValidateMetadata(const ZarrMetadata& metadata, Result GetNewMetadata( const ZarrPartialMetadata& partial_metadata, - const SelectedField& selected_field, const Schema& schema) { + const SelectedField& selected_field, const Schema& schema, + bool open_as_void) { ZarrMetadataPtr metadata = std::make_shared(); metadata->zarr_format = partial_metadata.zarr_format.value_or(2); metadata->dimension_separator = partial_metadata.dimension_separator.value_or( @@ -167,12 +168,17 @@ Result GetNewMetadata( // before validating the domain. size_t selected_field_index = 0; + const ZarrDType::Field* field_ptr = nullptr; if (partial_metadata.dtype) { - // If a zarr dtype is specified explicitly, determine the field index. If a - // multi-field zarr dtype is desired, it must be specified explicitly. 
- TENSORSTORE_ASSIGN_OR_RETURN( - selected_field_index, - GetFieldIndex(*partial_metadata.dtype, selected_field)); + if (open_as_void) { + field_ptr = partial_metadata.dtype->GetVoidField(); + } else { + TENSORSTORE_ASSIGN_OR_RETURN( + size_t field_index, + GetFieldIndex(*partial_metadata.dtype, selected_field)); + field_ptr = &partial_metadata.dtype->fields[field_index]; + selected_field_index = field_index; + } metadata->dtype = *partial_metadata.dtype; } else { if (!selected_field.empty()) { @@ -180,6 +186,11 @@ Result GetNewMetadata( "\"dtype\" must be specified in \"metadata\" if \"field\" is " "specified"); } + if (open_as_void) { + return absl::InvalidArgumentError( + "\"dtype\" must be specified in \"metadata\" if \"open_as_void\" is " + "specified"); + } if (!schema.dtype().valid()) { return absl::InvalidArgumentError("\"dtype\" must be specified"); } @@ -191,8 +202,9 @@ Result GetNewMetadata( static_cast(field), internal_zarr::ChooseBaseDType(schema.dtype())); TENSORSTORE_RETURN_IF_ERROR(ValidateDType(metadata->dtype)); + field_ptr = &field; } - auto& field = metadata->dtype.fields[selected_field_index]; + auto& field = *field_ptr; SpecRankAndFieldInfo info; info.full_rank = schema.rank(); @@ -333,15 +345,27 @@ absl::Status ValidateSpecRankAndFieldInfo(SpecRankAndFieldInfo& info) { Result GetSpecRankAndFieldInfo( const ZarrPartialMetadata& metadata, const SelectedField& selected_field, const Schema& schema) { + return GetSpecRankAndFieldInfo(metadata, selected_field, schema, + /*open_as_void=*/false); +} + +Result GetSpecRankAndFieldInfo( + const ZarrPartialMetadata& metadata, const SelectedField& selected_field, + const Schema& schema, bool open_as_void) { SpecRankAndFieldInfo info; info.full_rank = schema.rank(); info.chunked_rank = metadata.rank; if (metadata.dtype) { - TENSORSTORE_ASSIGN_OR_RETURN( - size_t field_index, GetFieldIndex(*metadata.dtype, selected_field)); - info.field = &metadata.dtype->fields[field_index]; + if (open_as_void) { + 
info.field = metadata.dtype->GetVoidField(); + } else { + TENSORSTORE_ASSIGN_OR_RETURN( + size_t field_index, + GetFieldIndex(*metadata.dtype, selected_field)); + info.field = &metadata.dtype->fields[field_index]; + } } TENSORSTORE_RETURN_IF_ERROR(ValidateSpecRankAndFieldInfo(info)); diff --git a/tensorstore/driver/zarr/spec.h b/tensorstore/driver/zarr/spec.h index 0ef3ab9d3..92436755c 100644 --- a/tensorstore/driver/zarr/spec.h +++ b/tensorstore/driver/zarr/spec.h @@ -70,9 +70,11 @@ using SelectedField = std::string; /// \param partial_metadata Constraints in the form of partial zarr metadata. /// \param selected_field The field to which `schema` applies. /// \param schema Schema constraints for the `selected_field`. +/// \param open_as_void If true, opens the array as raw bytes. Result GetNewMetadata( const ZarrPartialMetadata& partial_metadata, - const SelectedField& selected_field, const Schema& schema); + const SelectedField& selected_field, const Schema& schema, + bool open_as_void); struct SpecRankAndFieldInfo { /// Full rank of the TensorStore, if known. Equal to the chunked rank plus @@ -95,6 +97,14 @@ Result GetSpecRankAndFieldInfo( const ZarrPartialMetadata& metadata, const SelectedField& selected_field, const Schema& schema); +/// Overload that supports open_as_void mode. +/// +/// When `open_as_void` is true and `metadata.dtype` is specified, `info.field` +/// points to the dtype's synthesized void field. 
+Result GetSpecRankAndFieldInfo( + const ZarrPartialMetadata& metadata, const SelectedField& selected_field, + const Schema& schema, bool open_as_void); + SpecRankAndFieldInfo GetSpecRankAndFieldInfo(const ZarrMetadata& metadata, size_t field_index); diff --git a/tensorstore/driver/zarr/spec_test.cc b/tensorstore/driver/zarr/spec_test.cc index 6cd358dd5..660e12429 100644 --- a/tensorstore/driver/zarr/spec_test.cc +++ b/tensorstore/driver/zarr/spec_test.cc @@ -252,7 +252,7 @@ tensorstore::Result<::nlohmann::json> GetNewMetadataFromOptions( ZarrPartialMetadata::FromJson(partial_metadata_json)); TENSORSTORE_ASSIGN_OR_RETURN( auto new_metadata, - GetNewMetadata(partial_metadata, selected_field, schema)); + GetNewMetadata(partial_metadata, selected_field, schema, false)); return new_metadata->ToJson(); } @@ -638,6 +638,24 @@ TEST(GetNewMetadataTest, SelectedFieldDtypeNotSpecified) { "\"field\" is specified"))); } +TEST(GetNewMetadataTest, OpenAsVoidDtypeNotSpecified) { + // When open_as_void=true, dtype must be specified in metadata because + // open_as_void is for accessing existing structured data as raw bytes. + Schema schema; + TENSORSTORE_ASSERT_OK(schema.Set(Schema::Shape({100, 200}))); + TENSORSTORE_ASSERT_OK(schema.Set(dtype_v)); + TENSORSTORE_ASSERT_OK_AND_ASSIGN( + auto partial_metadata, + ZarrPartialMetadata::FromJson(::nlohmann::json::object_t())); + EXPECT_THAT( + tensorstore::internal_zarr::GetNewMetadata(partial_metadata, + /*selected_field=*/{}, schema, + /*open_as_void=*/true), + StatusIs(absl::StatusCode::kInvalidArgument, + HasSubstr("\"dtype\" must be specified in \"metadata\" if " + "\"open_as_void\" is specified"))); +} + TEST(GetNewMetadataTest, SelectedFieldInvalid) { EXPECT_THAT( GetNewMetadataFromOptions({{"dtype", {{"x", "