From c0ca924544225f411ecaf5b5d8887e4f636cef54 Mon Sep 17 00:00:00 2001 From: Justin Davis Date: Thu, 18 Dec 2025 14:32:34 -0800 Subject: [PATCH 1/3] Add CV-CUDA support to ToDtype transform --- test/test_transforms_v2.py | 96 ++++++++++++++-- torchvision/transforms/v2/_misc.py | 13 ++- torchvision/transforms/v2/functional/_misc.py | 104 +++++++++++++++++- 3 files changed, 198 insertions(+), 15 deletions(-) diff --git a/test/test_transforms_v2.py b/test/test_transforms_v2.py index b9f440bd545..c46fe2cbfe3 100644 --- a/test/test_transforms_v2.py +++ b/test/test_transforms_v2.py @@ -21,6 +21,7 @@ import torchvision.transforms.v2 as transforms from common_utils import ( + assert_close, assert_equal, cache, cpu_and_cuda, @@ -42,7 +43,6 @@ ) from torch import nn -from torch.testing import assert_close from torch.utils._pytree import tree_flatten, tree_map from torch.utils.data import DataLoader, default_collate from torchvision import tv_tensors @@ -2619,7 +2619,32 @@ def test_kernel(self, kernel, make_input, input_dtype, output_dtype, device, sca scale=scale, ) - @pytest.mark.parametrize("make_input", [make_image_tensor, make_image, make_video]) + @pytest.mark.parametrize( + ("kernel", "input_type"), + [ + (F.to_dtype_image, torch.Tensor), + (F.to_dtype_video, tv_tensors.Video), + pytest.param( + F._misc._to_dtype_image_cvcuda, + None, + marks=pytest.mark.needs_cvcuda, + ), + ], + ) + def test_functional_signature(self, kernel, input_type): + if kernel is F._misc._to_dtype_image_cvcuda: + input_type = _import_cvcuda().Tensor + check_functional_kernel_signature_match(F.to_dtype, kernel=kernel, input_type=input_type) + + @pytest.mark.parametrize( + "make_input", + [ + make_image_tensor, + make_image, + make_video, + pytest.param(make_image_cvcuda, marks=pytest.mark.needs_cvcuda), + ], + ) @pytest.mark.parametrize("input_dtype", [torch.float32, torch.float64, torch.uint8]) @pytest.mark.parametrize("output_dtype", [torch.float32, torch.float64, torch.uint8]) @pytest.mark.parametrize("device", cpu_and_cuda()) @@ -2634,7 +2659,14 @@ def test_functional(self, make_input, input_dtype, output_dtype, device, scale): @pytest.mark.parametrize( "make_input", - [make_image_tensor, make_image, make_bounding_boxes, make_segmentation_mask, make_video], + [ + make_image_tensor, + make_image, + make_bounding_boxes, + make_segmentation_mask, + make_video, + pytest.param(make_image_cvcuda, marks=pytest.mark.needs_cvcuda), + ], ) @pytest.mark.parametrize("input_dtype", [torch.float32, torch.float64, torch.uint8]) @pytest.mark.parametrize("output_dtype", [torch.float32, torch.float64, torch.uint8]) @@ -2680,25 +2712,69 @@ def fn(value): return torch.tensor(tree_map(fn, image.tolist())).to(dtype=output_dtype, device=image.device) + def _get_dtype_conversion_atol_cvcuda(self, input_dtype, output_dtype): + in_bits = torch.iinfo(input_dtype).bits if not input_dtype.is_floating_point else None + out_bits = torch.iinfo(output_dtype).bits if not output_dtype.is_floating_point else None + narrows_bits = in_bits is not None and out_bits is not None and out_bits < in_bits + + # int->int with narrowing bits, allow atol=1 for rounding diffs + if narrows_bits: + atol = 1 + # float->int check for same diff, rounding error on float + elif input_dtype.is_floating_point and not output_dtype.is_floating_point: + atol = 1 + # if generating a float value from an int, allow small rounding error + elif not input_dtype.is_floating_point and output_dtype.is_floating_point: + atol = 1e-7 + # all other cases, should be exact + # uint8 -> uint16 promotion would be here + else: + atol = 0 + + return atol + @pytest.mark.parametrize("input_dtype", [torch.float32, torch.float64, torch.uint8, torch.uint16]) @pytest.mark.parametrize("output_dtype", [torch.float32, torch.float64, torch.uint8, torch.uint16]) @pytest.mark.parametrize("device", cpu_and_cuda()) @pytest.mark.parametrize("scale", (True, False)) - def test_image_correctness(self, input_dtype, output_dtype, device, scale): + @pytest.mark.parametrize( + "make_input", + [ + make_image, + pytest.param(make_image_cvcuda, marks=pytest.mark.needs_cvcuda), + ], + ) + @pytest.mark.parametrize("fn", [F.to_dtype, transform_cls_to_functional(transforms.ToDtype)]) + def test_image_correctness(self, input_dtype, output_dtype, device, scale, make_input, fn): if input_dtype.is_floating_point and output_dtype == torch.int64: pytest.xfail("float to int64 conversion is not supported") if input_dtype == torch.uint8 and output_dtype == torch.uint16 and device == "cuda": pytest.xfail("uint8 to uint16 conversion is not supported on cuda") + if ( + input_dtype == torch.uint16 + and output_dtype == torch.uint8 + and not scale + and make_input is make_image_cvcuda + ): + pytest.xfail("uint16 to uint8 conversion without scale is not supported for CV-CUDA.") - input = make_image(dtype=input_dtype, device=device) + input = make_input(dtype=input_dtype, device=device) + out = fn(input, dtype=output_dtype, scale=scale) + + if make_input is make_image_cvcuda: + input = F.cvcuda_to_tensor(input) + out = F.cvcuda_to_tensor(out) - out = F.to_dtype(input, dtype=output_dtype, scale=scale) expected = self.reference_convert_dtype_image_tensor(input, dtype=output_dtype, scale=scale) - if input_dtype.is_floating_point and not output_dtype.is_floating_point and scale: - torch.testing.assert_close(out, expected, atol=1, rtol=0) - else: - torch.testing.assert_close(out, expected) + atol, rtol = None, None + if make_input is make_image_cvcuda: + atol = self._get_dtype_conversion_atol_cvcuda(input_dtype, output_dtype) + rtol = 0 + elif input_dtype.is_floating_point and not output_dtype.is_floating_point and scale: + atol, rtol = 1, 0 + + torch.testing.assert_close(out, expected, atol=atol, rtol=rtol) def was_scaled(self, inpt): # this assumes the target dtype is float diff --git a/torchvision/transforms/v2/_misc.py b/torchvision/transforms/v2/_misc.py index 305149c87b1..26749c855a4 100644 --- a/torchvision/transforms/v2/_misc.py +++ b/torchvision/transforms/v2/_misc.py @@ -9,6 +9,7 @@ from torchvision import transforms as _transforms, tv_tensors from torchvision.transforms.v2 import functional as F, Transform +from torchvision.transforms.v2.functional._utils import _is_cvcuda_tensor from ._utils import ( _parse_labels_getter, @@ -267,7 +268,7 @@ class ToDtype(Transform): Default: ``False``. """ - _transformed_types = (torch.Tensor,) + _transformed_types = Transform._transformed_types + (_is_cvcuda_tensor,) def __init__( self, dtype: Union[torch.dtype, dict[Union[type, str], Optional[torch.dtype]]], scale: bool = False @@ -294,7 +295,11 @@ def transform(self, inpt: Any, params: dict[str, Any]) -> Any: if isinstance(self.dtype, torch.dtype): # For consistency / BC with ConvertImageDtype, we only care about images or videos when dtype # is a simple torch.dtype - if not is_pure_tensor(inpt) and not isinstance(inpt, (tv_tensors.Image, tv_tensors.Video)): + if ( + not is_pure_tensor(inpt) + and not isinstance(inpt, (tv_tensors.Image, tv_tensors.Video)) + and not _is_cvcuda_tensor(inpt) + ): return inpt dtype: Optional[torch.dtype] = self.dtype @@ -311,7 +316,9 @@ def transform(self, inpt: Any, params: dict[str, Any]) -> Any: 'e.g. dtype={tv_tensors.Mask: torch.int64, "others": None} to pass-through the rest of the inputs.' ) - supports_scaling = is_pure_tensor(inpt) or isinstance(inpt, (tv_tensors.Image, tv_tensors.Video)) + supports_scaling = ( + is_pure_tensor(inpt) or isinstance(inpt, (tv_tensors.Image, tv_tensors.Video)) or _is_cvcuda_tensor(inpt) + ) if dtype is None: if self.scale and supports_scaling: warnings.warn( diff --git a/torchvision/transforms/v2/functional/_misc.py b/torchvision/transforms/v2/functional/_misc.py index daf263df046..6ae5466621c 100644 --- a/torchvision/transforms/v2/functional/_misc.py +++ b/torchvision/transforms/v2/functional/_misc.py @@ -1,5 +1,5 @@ import math -from typing import Optional +from typing import Optional, TYPE_CHECKING import PIL.Image import torch @@ -13,7 +13,12 @@ from ._meta import _convert_bounding_box_format -from ._utils import _get_kernel, _register_kernel_internal, is_pure_tensor +from ._utils import _get_kernel, _import_cvcuda, _is_cvcuda_available, _register_kernel_internal, is_pure_tensor + +CVCUDA_AVAILABLE = _is_cvcuda_available() + +if TYPE_CHECKING: + import cvcuda # type: ignore[import-not-found] def normalize( @@ -340,6 +345,101 @@ def _to_dtype_tensor_dispatch(inpt: torch.Tensor, dtype: torch.dtype, scale: boo return inpt.to(dtype) +# cvcuda is only used if it is installed, so we can simply define empty mappings +_torch_to_cvcuda_dtypes: dict[torch.dtype, "cvcuda.Type"] = {} +_cvcuda_to_torch_dtypes: dict["cvcuda.Type", torch.dtype] = {} + + +def _to_dtype_image_cvcuda( + inpt: "cvcuda.Tensor", + dtype: torch.dtype = torch.float, + scale: bool = False, +) -> "cvcuda.Tensor": + """ + Convert the dtype of a CV-CUDA tensor, based on a torch.dtype. + + Args: + inpt: The CV-CUDA tensor to convert the dtype of. + dtype: The torch.dtype to convert the dtype to. + scale: Whether to scale the values to the new dtype. + There are four cases for the scaling setup: + 1. float -> float + 2. int -> int + 3. float -> int + 4. int -> float + If scale is True, the values will be scaled to the new dtype. + If scale is False, the values will not be scaled. + The scale values for float -> float are 1.0 and 0.0 respectively. + The scale values for int -> int are 2^(bit_diff) of the new dtype. + Where bit_diff is the difference in the number of bits of the new dtype and the input dtype. + The scale values for float -> int and int -> float are the maximum value of the new dtype. + + Returns: + out (cvcuda.Tensor): The CV-CUDA tensor with the converted dtype. + + """ + cvcuda = _import_cvcuda() + + if not _torch_to_cvcuda_dtypes: + _torch_to_cvcuda_dtypes[torch.uint8] = cvcuda.Type.U8 + _torch_to_cvcuda_dtypes[torch.uint16] = cvcuda.Type.U16 + _torch_to_cvcuda_dtypes[torch.uint32] = cvcuda.Type.U32 + _torch_to_cvcuda_dtypes[torch.uint64] = cvcuda.Type.U64 + _torch_to_cvcuda_dtypes[torch.int8] = cvcuda.Type.S8 + _torch_to_cvcuda_dtypes[torch.int16] = cvcuda.Type.S16 + _torch_to_cvcuda_dtypes[torch.int32] = cvcuda.Type.S32 + _torch_to_cvcuda_dtypes[torch.int64] = cvcuda.Type.S64 + _torch_to_cvcuda_dtypes[torch.float32] = cvcuda.Type.F32 + _torch_to_cvcuda_dtypes[torch.float64] = cvcuda.Type.F64 + + if not _cvcuda_to_torch_dtypes: + for k, v in _torch_to_cvcuda_dtypes.items(): + _cvcuda_to_torch_dtypes[v] = k + + dtype_in = _cvcuda_to_torch_dtypes.get(inpt.dtype) + cvc_dtype = _torch_to_cvcuda_dtypes.get(dtype) + if dtype_in is None or cvc_dtype is None: + raise ValueError(f"No torch or cvcuda dtype found for dtype {dtype} or {inpt.dtype}") + + # torchvision will overflow the values of uint16 when converting down to uint8 without scale + # example: 300 -> 255 (cvcuda) vs 300 mod 256 = 44 (torchvision) + # since it is not equivalent, raise an error for unsupported behavior + # the workaround could be using torch for dtype conversion directly via zero-copy + if dtype_in == torch.uint16 and dtype == torch.uint8 and not scale: + raise ValueError("uint16 to uint8 conversion without scale is not supported for CV-CUDA.") + + scale_val, offset = 1.0, 0.0 + if scale: + in_dtype_float = dtype_in.is_floating_point + out_dtype_float = dtype.is_floating_point + + if in_dtype_float and out_dtype_float: + scale_val, offset = 1.0, 0.0 + elif not in_dtype_float and not out_dtype_float: + in_bits = torch.iinfo(dtype_in).bits + out_bits = torch.iinfo(dtype).bits + scale_val = float(2 ** (out_bits - in_bits)) + offset = 0.0 + elif in_dtype_float and not out_dtype_float: + # Mirror the scaling factor which torchvision uses + eps = 1e-3 + max_val = float(_max_value(dtype)) + scale_val, offset = max_val + 1.0 - eps, 0.0 + else: + scale_val, offset = 1.0 / float(_max_value(dtype_in)), 0.0 + + return cvcuda.convertto( + inpt, + dtype=cvc_dtype, + scale=scale_val, + offset=offset, + ) + + +if CVCUDA_AVAILABLE: + _register_kernel_internal(to_dtype, _import_cvcuda().Tensor)(_to_dtype_image_cvcuda) + + def sanitize_bounding_boxes( bounding_boxes: torch.Tensor, format: Optional[tv_tensors.BoundingBoxFormat] = None, From e427eaf09e83a6ada8bee9bae6d3815963058908 Mon Sep 17 00:00:00 2001 From: Justin Davis Date: Thu, 18 Dec 2025 14:37:45 -0800 Subject: [PATCH 2/3] fix transformed types list --- torchvision/transforms/v2/_misc.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/torchvision/transforms/v2/_misc.py b/torchvision/transforms/v2/_misc.py index 26749c855a4..e8631af1574 100644 --- a/torchvision/transforms/v2/_misc.py +++ b/torchvision/transforms/v2/_misc.py @@ -268,7 +268,7 @@ class ToDtype(Transform): Default: ``False``. """ - _transformed_types = Transform._transformed_types + (_is_cvcuda_tensor,) + _transformed_types = (torch.Tensor, _is_cvcuda_tensor) def __init__( self, dtype: Union[torch.dtype, dict[Union[type, str], Optional[torch.dtype]]], scale: bool = False From 70fb3936b77587efa0ee0989320d2b9c4fa2aadc Mon Sep 17 00:00:00 2001 From: Justin Davis Date: Thu, 18 Dec 2025 14:39:36 -0800 Subject: [PATCH 3/3] fix import changes --- test/test_transforms_v2.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/test_transforms_v2.py b/test/test_transforms_v2.py index c46fe2cbfe3..12c3fe7c0dc 100644 --- a/test/test_transforms_v2.py +++ b/test/test_transforms_v2.py @@ -21,7 +21,6 @@ import torchvision.transforms.v2 as transforms from common_utils import ( - assert_close, assert_equal, cache, cpu_and_cuda, @@ -43,6 +42,7 @@ ) from torch import nn +from torch.testing import assert_close from torch.utils._pytree import tree_flatten, tree_map from torch.utils.data import DataLoader, default_collate from torchvision import tv_tensors