From a368332af9c38d39c59018a9d1d4c5728c62f69c Mon Sep 17 00:00:00 2001 From: LTLA Date: Tue, 17 Dec 2024 14:07:00 -0800 Subject: [PATCH 1/3] Simplify DelayedArrays by collapsing repeated operations. This reduces the number of operations in each DelayedArray object without changing the result, which makes the DA easier to interpret and parse. --- src/delayedarray/Combine.py | 21 +++++++++++++++- src/delayedarray/DelayedArray.py | 23 +++++++++++------ src/delayedarray/Subset.py | 34 ++++++++++++++++++++------ src/delayedarray/Transpose.py | 32 ++++++++++++++++++++---- tests/test_Combine.py | 42 ++++++++++++++++++++++++++++++++ tests/test_Subset.py | 31 +++++++++++++++++++++++ tests/test_Transpose.py | 23 +++++++++++++++++ 7 files changed, 185 insertions(+), 21 deletions(-) diff --git a/src/delayedarray/Combine.py b/src/delayedarray/Combine.py index 2a56fda..1f77e95 100644 --- a/src/delayedarray/Combine.py +++ b/src/delayedarray/Combine.py @@ -1,5 +1,6 @@ -from typing import Callable, Tuple, Sequence +from typing import Callable, Tuple, Sequence, Any import numpy +import copy from .DelayedOp import DelayedOp from ._mask import _concatenate_unmasked_ndarrays, _concatenate_maybe_masked_ndarrays @@ -101,6 +102,24 @@ def along(self) -> int: return self._along +def _simplify_combine(x: Combine) -> Any: + all_seeds = [] + for ss in x.seeds: + if type(ss) is Combine and x.along == ss.along: + # Don't use isinstance, we don't want to collapse for Combine + # subclasses that might be doing god knows what. + all_seeds += ss.seeds + else: + all_seeds.append(ss) + if len(all_seeds) == 1: + return all_seeds[0] + if len(all_seeds) == len(x.seeds): + return x + new_x = copy.copy(x) + new_x._seeds = all_seeds + return new_x + + def _extract_subarrays(x: Combine, subset: Tuple[Sequence[int], ...], f: Callable): # Figuring out which slices belong to who. chosen = subset[x._along] diff --git a/src/delayedarray/DelayedArray.py b/src/delayedarray/DelayedArray.py index 9edc4cc..c8e3423 100644 --- a/src/delayedarray/DelayedArray.py +++ b/src/delayedarray/DelayedArray.py @@ -6,10 +6,10 @@ from .SparseNdarray import SparseNdarray from .BinaryIsometricOp import BinaryIsometricOp from .Cast import Cast -from .Combine import Combine +from .Combine import Combine, _simplify_combine from .Round import Round -from .Subset import Subset -from .Transpose import Transpose +from .Subset import Subset, _simplify_subset +from .Transpose import Transpose, _simplify_transpose from .UnaryIsometricOpSimple import UnaryIsometricOpSimple from .UnaryIsometricOpWithArgs import UnaryIsometricOpWithArgs @@ -136,7 +136,9 @@ def T(self) -> "DelayedArray": Returns: A ``DelayedArray`` containing the delayed transpose. """ - return DelayedArray(Transpose(self._seed, perm=None)) + tout = Transpose(self._seed, perm=None) + tout = _simplify_transpose(tout) + return DelayedArray(tout) def __repr__(self) -> str: """Pretty-print this ``DelayedArray``. This uses @@ -253,12 +255,13 @@ def __array_function__(self, func, types, args, kwargs) -> "DelayedArray": seeds = [] for x in args[0]: seeds.append(_extract_seed(x)) - if "axis" in kwargs: axis = kwargs["axis"] else: axis = 0 - return DelayedArray(Combine(seeds, along=axis)) + cout = Combine(seeds, along=axis) + cout = _simplify_combine(cout) + return DelayedArray(cout) if func == numpy.transpose: seed = _extract_seed(args[0]) @@ -266,7 +269,9 @@ def __array_function__(self, func, types, args, kwargs) -> "DelayedArray": axes = kwargs["axes"] else: axes = None - return DelayedArray(Transpose(seed, perm=axes)) + tout = Transpose(seed, perm=axes) + tout = _simplify_transpose(tout) + return DelayedArray(tout) if func == numpy.round: seed = _extract_seed(args[0]) @@ -808,7 +813,9 @@ def __getitem__(self, subset: Tuple[Union[slice, Sequence], ...]) -> Union["Dela """ cleaned = _getitem_subset_preserves_dimensions(self.shape, subset) if cleaned is not None: - return DelayedArray(Subset(self._seed, cleaned)) + sout = Subset(self._seed, cleaned) + sout = _simplify_subset(sout) + return DelayedArray(sout) return _getitem_subset_discards_dimensions(self._seed, subset, extract_dense_array) diff --git a/src/delayedarray/Subset.py b/src/delayedarray/Subset.py index bd22dd9..2e50c7c 100644 --- a/src/delayedarray/Subset.py +++ b/src/delayedarray/Subset.py @@ -1,10 +1,12 @@ -from typing import Callable, Sequence, Tuple +from typing import Callable, Sequence, Tuple, Any from numpy import dtype, ndarray, ix_ import numpy +import biocutils +import copy from .DelayedOp import DelayedOp from .SparseNdarray import SparseNdarray -from ._subset import _sanitize_subset +from ._subset import _sanitize_subset, _is_single_subset_noop from .extract_dense_array import extract_dense_array from .extract_sparse_array import extract_sparse_array from .create_dask_array import create_dask_array @@ -87,6 +89,28 @@ def subset(self) -> Tuple[Sequence[int], ...]: return self._subset +def _simplify_subset(x: Subset) -> Any: + seed = x.seed + if not type(seed) is Subset: + # Don't use isinstance, we don't want to collapse for Subset + # subclasses that might be doing god knows what. + return x + all_subsets = [] + noop = True + for i, sub in enumerate(x.subset): + seed_sub = seed.subset[i] + new_sub = biocutils.subset_sequence(seed_sub, sub) + if noop and not _is_single_subset_noop(seed.seed.shape[i], new_sub): + noop = False + all_subsets.append(new_sub) + if noop: + return seed.seed + new_x = copy.copy(x) + new_x._seed = seed.seed + new_x._subset = (*all_subsets,) + return new_x + + def _extract_array(x: Subset, subset: Tuple[Sequence[int], ...], f: Callable): newsub = list(subset) expanded = [] @@ -94,11 +118,7 @@ def _extract_array(x: Subset, subset: Tuple[Sequence[int], ...], f: Callable): for i, s in enumerate(newsub): cursub = x._subset[i] - if isinstance(cursub, ndarray): - replacement = cursub[s] - else: - replacement = [cursub[j] for j in s] - + replacement = biocutils.subset_sequence(cursub, s) san_sub, san_remap = _sanitize_subset(replacement) newsub[i] = san_sub diff --git a/src/delayedarray/Transpose.py b/src/delayedarray/Transpose.py index 3432c18..40a4fb9 100644 --- a/src/delayedarray/Transpose.py +++ b/src/delayedarray/Transpose.py @@ -1,6 +1,7 @@ -from typing import Callable, Optional, Tuple, Sequence +from typing import Callable, Optional, Tuple, Sequence, Any from numpy import dtype, transpose import numpy +import copy from .DelayedOp import DelayedOp from .SparseNdarray import SparseNdarray @@ -40,8 +41,6 @@ def __init__(self, seed, perm: Optional[Tuple[int, ...]]): dimension ordering is assumed to be reversed. """ - self._seed = seed - curshape = seed.shape ndim = len(curshape) if perm is not None: @@ -52,12 +51,12 @@ def __init__(self, seed, perm: Optional[Tuple[int, ...]]): else: perm = (*range(ndim - 1, -1, -1),) - self._perm = perm - final_shape = [] for x in perm: final_shape.append(curshape[x]) + self._seed = seed + self._perm = perm self._shape = (*final_shape,) @property @@ -94,6 +93,29 @@ def perm(self) -> Tuple[int, ...]: return self._perm +def _simplify_transpose(x: Transpose) -> Any: + seed = x.seed + if not type(seed) is Transpose: + # Don't use isinstance, we don't want to collapse for Transpose + # subclasses that might be doing god knows what. + return x + + new_perm = [] + noop = True + for i, p in enumerate(x.perm): + new_p = seed.perm[p] + if new_p != i: + noop = False + new_perm.append(new_p) + if noop: + return seed.seed + + new_x = copy.copy(x) + new_x._seed = seed.seed + new_x._perm = (*new_perm,) + return new_x + + def _extract_array(x: Transpose, subset: Tuple[Sequence[int], ...], f: Callable): permsub = [None] * len(subset) for i, j in enumerate(x._perm): diff --git a/tests/test_Combine.py b/tests/test_Combine.py index 1244729..dfaf28e 100644 --- a/tests/test_Combine.py +++ b/tests/test_Combine.py @@ -43,6 +43,48 @@ def test_Combine_otherdim(left_mask_rate, right_mask_rate): assert_identical_ndarrays(delayedarray.to_dense_array(x), safe_concatenate((y1, y2), axis=1)) +def test_Combine_simplified(): + y1 = simulate_ndarray((30, 23), mask_rate=0) + y2 = simulate_ndarray((50, 23), mask_rate=0) + y3 = simulate_ndarray((30, 41), mask_rate=0) + + x1 = delayedarray.DelayedArray(y1) + x2 = delayedarray.DelayedArray(y2) + x3 = delayedarray.DelayedArray(y3) + + com = numpy.concatenate((x1, x2)) + com2 = numpy.concatenate((com, x2)) + assert isinstance(com2, delayedarray.DelayedArray) + assert isinstance(com2.seed, delayedarray.Combine) + assert len(com2.seed.seeds) == 3 + assert [isinstance(s, delayedarray.Combine) for s in com2.seed.seeds] == [False] * 3 + assert_identical_ndarrays(delayedarray.to_dense_array(com2), safe_concatenate((y1, y2, y2))) + + com = numpy.concatenate((x1, x3), axis=1) + com2 = numpy.concatenate((com, x1), axis=1) + assert isinstance(com2, delayedarray.DelayedArray) + assert isinstance(com2.seed, delayedarray.Combine) + assert len(com2.seed.seeds) == 3 + assert [isinstance(s, delayedarray.Combine) for s in com2.seed.seeds] == [False] * 3 + assert_identical_ndarrays(delayedarray.to_dense_array(com2), safe_concatenate((y1, y3, y1), axis=1)) + + # No-ops properly. + com = numpy.concatenate((x1,)) + assert isinstance(com, delayedarray.DelayedArray) + assert isinstance(com.seed, numpy.ndarray) + assert_identical_ndarrays(delayedarray.to_dense_array(com), y1) + + # Doesn't attempt to collapse if the axes are different. + com = numpy.concatenate((x1, x2)) + com2 = numpy.concatenate((com, com), axis=1) + assert isinstance(com2, delayedarray.DelayedArray) + assert isinstance(com2.seed, delayedarray.Combine) + assert len(com2.seed.seeds) == 2 + assert [isinstance(s, delayedarray.Combine) for s in com2.seed.seeds] == [True] * 2 + ref = numpy.concatenate((y1, y2)) + assert_identical_ndarrays(delayedarray.to_dense_array(com2), safe_concatenate((ref, ref), axis=1)) + + @pytest.mark.parametrize("left_mask_rate", [0, 0.2]) @pytest.mark.parametrize("right_mask_rate", [0, 0.2]) def test_Combine_subset(left_mask_rate, right_mask_rate): diff --git a/tests/test_Subset.py b/tests/test_Subset.py index 7c0b53d..c8e471a 100644 --- a/tests/test_Subset.py +++ b/tests/test_Subset.py @@ -1,6 +1,7 @@ import delayedarray import numpy import pytest +import biocutils from utils import simulate_ndarray, assert_identical_ndarrays, simulate_SparseNdarray @@ -13,6 +14,9 @@ def test_Subset_ix(mask_rate): subix = numpy.ix_(range(1, 10), [20, 30, 40], [10, 11, 12, 13]) sub = x[subix] + assert isinstance(sub, delayedarray.DelayedArray) + assert isinstance(sub.seed, delayedarray.Subset) + assert sub.shape == (9, 3, 4) assert isinstance(sub.seed.seed, numpy.ndarray) assert len(sub.seed.subset) == 3 @@ -88,6 +92,33 @@ def test_Subset_unsorted_duplicates(mask_rate): assert_identical_ndarrays(delayedarray.to_dense_array(sub), y[:, [5, 4, 3, 2, 1, 0], :]) +def test_Subset_simplified(): + test_shape = (30, 55) + y = simulate_ndarray(test_shape, mask_rate=0) + x = delayedarray.DelayedArray(y) + + sub = x[:, list(range(0, 55, 2))] + sub2 = sub[:, list(range(5, 20))] + assert isinstance(sub2, delayedarray.DelayedArray) + assert isinstance(sub2.seed, delayedarray.Subset) + assert isinstance(sub2.seed.seed, numpy.ndarray) + assert_identical_ndarrays(delayedarray.to_dense_array(sub2), y[:, biocutils.subset_sequence(range(0, 55, 2), range(5, 20))]) + + sub = x[list(range(10, 20)), :] + sub2 = sub[:, list(range(0, 55, 5))] + assert isinstance(sub2, delayedarray.DelayedArray) + assert isinstance(sub2.seed, delayedarray.Subset) + assert isinstance(sub2.seed.seed, numpy.ndarray) + assert_identical_ndarrays(delayedarray.to_dense_array(sub2), y[10:20,0:55:5]) + + # Identifies no-ops and returns the seed directly. + sub = x[::-1,::-1] + sub2 = sub[::-1,::-1] + assert isinstance(sub2, delayedarray.DelayedArray) + assert isinstance(sub2.seed, numpy.ndarray) + assert_identical_ndarrays(delayedarray.to_dense_array(sub2), y) + + @pytest.mark.parametrize("mask_rate", [0, 0.2]) def test_Subset_subset(mask_rate): y = simulate_ndarray((99, 63), mask_rate=mask_rate) diff --git a/tests/test_Transpose.py b/tests/test_Transpose.py index 7da912d..2716aae 100644 --- a/tests/test_Transpose.py +++ b/tests/test_Transpose.py @@ -25,6 +25,29 @@ def test_Transpose_simple(mask_rate): assert_identical_ndarrays(delayedarray.to_dense_array(t), numpy.transpose(y)) +def test_Transpose_simplified(): + y = simulate_ndarray((30, 23, 5), mask_rate=0) + x = delayedarray.DelayedArray(y) + + t = x.T + t2 = t.T + assert isinstance(t2, delayedarray.DelayedArray) + assert isinstance(t2.seed, numpy.ndarray) + assert_identical_ndarrays(delayedarray.to_dense_array(t2), y.T.T) + + t2 = numpy.transpose(t, axes=(2, 1, 0)) + assert isinstance(t2, delayedarray.DelayedArray) + assert isinstance(t2.seed, numpy.ndarray) + assert_identical_ndarrays(delayedarray.to_dense_array(t2), numpy.transpose(y.T, (2, 1, 0))) + + t2 = numpy.transpose(t, axes=(1, 2, 0)) + assert isinstance(t2, delayedarray.DelayedArray) + assert isinstance(t2.seed, delayedarray.Transpose) + assert t2.seed.perm == (1, 0, 2) + assert isinstance(t2.seed.seed, numpy.ndarray) + assert_identical_ndarrays(delayedarray.to_dense_array(t2), numpy.transpose(y.T, axes=(1, 2, 0))) + + @pytest.mark.parametrize("mask_rate", [0, 0.2]) def test_Transpose_more_dimensions(mask_rate): y = simulate_ndarray((30, 23, 10), mask_rate=mask_rate) From a0dffdee9f832efb942f807b6678aa71261d66c9 Mon Sep 17 00:00:00 2001 From: LTLA Date: Tue, 17 Dec 2024 14:30:56 -0800 Subject: [PATCH 2/3] Streamlined the Combine simplification. --- src/delayedarray/Combine.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/src/delayedarray/Combine.py b/src/delayedarray/Combine.py index 1f77e95..92651e3 100644 --- a/src/delayedarray/Combine.py +++ b/src/delayedarray/Combine.py @@ -103,17 +103,19 @@ def along(self) -> int: def _simplify_combine(x: Combine) -> Any: + if len(x.seeds) == 1: + return x.seeds[0] all_seeds = [] + simplified = False for ss in x.seeds: if type(ss) is Combine and x.along == ss.along: # Don't use isinstance, we don't want to collapse for Combine # subclasses that might be doing god knows what. all_seeds += ss.seeds + simplified = True else: all_seeds.append(ss) - if len(all_seeds) == 1: - return all_seeds[0] - if len(all_seeds) == len(x.seeds): + if not simplified: return x new_x = copy.copy(x) new_x._seeds = all_seeds From 4d7739b2a4e795bc42fe30a2e75fa621d94e486e Mon Sep 17 00:00:00 2001 From: LTLA Date: Tue, 17 Dec 2024 14:54:19 -0800 Subject: [PATCH 3/3] Mention that the latest version of BiocUtils is required. --- setup.cfg | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.cfg b/setup.cfg index b8f4945..47de245 100644 --- a/setup.cfg +++ b/setup.cfg @@ -50,7 +50,7 @@ python_requires = >=3.8 install_requires = importlib-metadata; python_version<"3.8" numpy - biocutils + biocutils>=0.1.8 [options.packages.find]