From 1f0976047e9a16a3be075d5cc2786ce77d56f00b Mon Sep 17 00:00:00 2001
From: "guangli.bao"
Date: Fri, 5 Dec 2025 16:14:26 +0800
Subject: [PATCH] UT for src/guidellm/data/deserializers/huggingface.py

Signed-off-by: guangli.bao
---
 .../data/deserializers/test_huggingface.py | 72 +++++++++++++++++++
 1 file changed, 72 insertions(+)
 create mode 100644 tests/unit/data/deserializers/test_huggingface.py

diff --git a/tests/unit/data/deserializers/test_huggingface.py b/tests/unit/data/deserializers/test_huggingface.py
new file mode 100644
index 00000000..aa1ae882
--- /dev/null
+++ b/tests/unit/data/deserializers/test_huggingface.py
@@ -0,0 +1,72 @@
+"""Unit tests for ``HuggingFaceDatasetDeserializer``."""
+
+import pytest
+from datasets import Dataset
+
+from guidellm.data.deserializers.huggingface import (
+    HuggingFaceDatasetDeserializer,
+)
+
+
+@pytest.fixture
+def processor_factory():
+    """Provide the processor-factory argument; these tests pass ``None``."""
+    return None
+
+
+@pytest.fixture
+def deserializer():
+    """Provide a ``HuggingFaceDatasetDeserializer`` built with no arguments."""
+    return HuggingFaceDatasetDeserializer()
+
+
+def test_hf_dataset_direct_return(deserializer, processor_factory):
+    """An in-memory ``Dataset`` must come back as the very same object."""
+    data = Dataset.from_dict({"text": ["hello", "world"]})
+    result = deserializer(data, processor_factory, random_seed=42)
+    assert result is data, "return original Dataset object"
+
+
+def test_local_hf_directory_dataset(deserializer, processor_factory, tmp_path):
+    """A dataset written by ``save_to_disk`` must load from its directory."""
+    # 1. Build a small dataset and save it under pytest's temporary directory.
+    dataset = Dataset.from_dict({"id": [1, 2], "text": ["a", "b"]})
+    dataset_dir = tmp_path / "local_hf_dataset"
+    dataset.save_to_disk(dataset_dir)
+
+    # 2. Deserialize directly from the directory path.
+    result = deserializer(
+        dataset_dir,
+        processor_factory,
+        random_seed=123,
+    )
+
+    # 3. The loaded rows must match what was saved; list() keeps the comparison
+    #    independent of the container type used for a column.
+    assert isinstance(result, Dataset)
+    assert list(result["text"]) == ["a", "b"]
+
+
+@pytest.mark.parametrize(
+    "internal_ds_name",
+    [
+        "mnist",
+        "imdb",
+    ],
+)
+def test_hf_internal_dataset(deserializer, processor_factory, internal_ds_name):
+    """Named datasets must load as a ``Dataset`` or a dict with both splits.
+
+    NOTE(review): resolving a dataset by name presumably needs Hugging Face Hub
+    access or a warm local cache -- confirm that is acceptable for unit tests.
+    """
+    result = deserializer(internal_ds_name, processor_factory, random_seed=42)
+
+    assert isinstance(result, (Dataset | dict)), "HF dataset loading failed"
+    # Test the type first so the membership check only runs on split mappings.
+    assert isinstance(result, Dataset) or "train" in result, (
+        "Expected 'train' split in the loaded dataset"
+    )
+    assert isinstance(result, Dataset) or "test" in result, (
+        "Expected 'test' split in the loaded dataset"
+    )