240 changes: 120 additions & 120 deletions dptb/data/build.py
@@ -1,25 +1,29 @@
import inspect
# in-built packages
import os
from copy import deepcopy
import glob
from importlib import import_module
import logging
import inspect
from typing import Union
from copy import deepcopy
from importlib import import_module
from pathlib import Path

# third-party packages
import torch

# local packages
from dptb import data
from dptb.data import AtomicDataset, register_fields
from dptb.data.dataset import DefaultDataset
from dptb.data.dataset._deeph_dataset import DeePHE3Dataset
from dptb.data.dataset._hdf5_dataset import HDF5Dataset
from dptb.data.dataset.lmdb_dataset import LMDBDataset
from dptb import data
from dptb.data.transforms import TypeMapper, OrbitalMapper
from dptb.data import AtomicDataset, register_fields
from dptb.utils import instantiate, get_w_prefix
from dptb.utils.tools import j_loader
from dptb.utils.argcheck import normalize_setinfo, normalize_lmdbsetinfo
from dptb.utils.argcheck import collect_cutoffs
from dptb.utils.argcheck import get_cutoffs_from_model_options
import logging
import torch
import copy


log = logging.getLogger(__name__)

@@ -159,125 +163,121 @@ def __call__(self,
ValueError: If the dataset type is not supported.
Exception: If the info.json file is not properly provided for a trajectory folder.
"""
self.r_max = r_max
self.er_max = er_max
self.oer_max = oer_max
# set cutoff radius
self.r_max = r_max # default cutoff
self.er_max = er_max # env cutoff
self.oer_max = oer_max

# seems this warning will always show?
self.if_check_cutoffs = False
log.warning("The cutoffs in data and model are not checked. be careful!")

# set and check the dataset type
dataset_type = type
assert dataset_type in ["DefaultDataset", "DeePHDataset", "HDF5Dataset", "LMDBDataset"], \
f"The dataset type {dataset_type} is not supported. Please check the type."

# See if we can get a OrbitalMapper.
if basis is not None:
idp = OrbitalMapper(basis=basis)
else:
idp = None

if dataset_type in ["DefaultDataset", "DeePHDataset", "HDF5Dataset", "LMDBDataset"]:
assert prefix is not None, "The prefix is not provided. Please provide the prefix to select the trajectory folders."
prefix_folders = glob.glob(f"{root}/{prefix}{separator}*")
include_folders=[]
for idir in prefix_folders:
if os.path.isdir(idir):
if not glob.glob(os.path.join(idir, '*.dat')) \
and not glob.glob(os.path.join(idir, '*.traj')) \
and not glob.glob(os.path.join(idir, '*.h5')) \
and not glob.glob(os.path.join(idir, '*.mdb')):
raise Exception(f"{idir} does not have the proper traj data files. Please check the data files.")
include_folders.append(idir.split('/')[-1])

assert isinstance(include_folders, list) and len(include_folders) > 0, "No trajectory folders are found. Please check the prefix."

# We need to check the `info.json` very carefully here.
# Different `info` points to different dataset,
# even if the data files in `root` are basically the same.
info_files = {}

# See if a public info is provided.
#if "info.json" in os.listdir(root):
if os.path.exists(f"{root}/info.json"):
public_info = j_loader(os.path.join(root, "info.json"))
if dataset_type == "LMDBDataset":
public_info = {}
log.info("A public `info.json` file is provided, but will not be used anymore for LMDBDataset.")
else:
public_info = normalize_setinfo(public_info)
log.info("A public `info.json` file is provided, and will be used by the subfolders who do not have their own `info.json` file.")
idp = None if basis is None else OrbitalMapper(basis=basis)

# filter to get the valid folders
assert prefix is not None, \
"The prefix is not provided. Please provide the prefix to select the trajectory folders."
valid_folders, delim = [], separator
for folder in Path(root).glob(f"{prefix}{delim}*"):
if folder.is_dir():
assert any(folder.glob(f'*.{ext}') for ext in ['dat', 'traj', 'h5', 'mdb']), \
f'{folder} does not have the proper traj data files. Please check the data files.'
valid_folders.append(folder.name)
assert isinstance(valid_folders, list) and len(valid_folders) > 0, \
"No trajectory folders are found. Please check the prefix."

# We need to check the `info.json` very carefully here.
# Different `info` points to different dataset,
# even if the data files in `root` are basically the same.
info_files, public_info = {}, None
# the info_files is a dict that maps the folder name to the info in dict format.

# set the public (default) info that will be used by subfolders without their
# own info.json file.
finfo = Path(root) / "info.json"
if finfo.exists():
public_info = j_loader(finfo)
if dataset_type == "LMDBDataset":
public_info = {}
log.info("A public `info.json` file is provided, "
"but will not be used anymore for LMDBDataset.")
else:
public_info = None

# Load info in each trajectory folders seperately.
for file in include_folders:
#if "info.json" in os.listdir(os.path.join(root, file)):

if dataset_type == "LMDBDataset":
info_files[file] = {}
elif os.path.exists(f"{root}/{file}/info.json"):
# use info provided in this trajectory.
info = j_loader(f"{root}/{file}/info.json")
info = normalize_setinfo(info)
info_files[file] = info
elif public_info is not None: # not lmbd and no info in subfolder, then must use public info.
# use public info instead
# yaml will not dump correctly if this is not a deepcopy.
info_files[file] = deepcopy(public_info)
else: # not lmdb no info in subfolder and no public info. then raise error.
log.error(f"for {dataset_type} type, the info.json is not properly provided for `{file}`")
raise ValueError(f"for {dataset_type} type, the info.json is not properly provided for `{file}`")

# We will sort the info_files here.
# The order itself is not important, but must be consistant for the same list.
info_files = {key: info_files[key] for key in sorted(info_files)}

for ikey in info_files:
info_files[ikey].update({'r_max': r_max, 'er_max': er_max, 'oer_max': oer_max})

if dataset_type == "DeePHDataset":
dataset = DeePHE3Dataset(
root=root,
type_mapper=idp,
get_Hamiltonian=get_Hamiltonian,
get_eigenvalues=get_eigenvalues,
info_files = info_files
)
elif dataset_type == "DefaultDataset":
dataset = DefaultDataset(
root=root,
type_mapper=idp,
get_Hamiltonian=get_Hamiltonian,
get_overlap=get_overlap,
get_DM=get_DM,
get_eigenvalues=get_eigenvalues,
info_files = info_files
)
elif dataset_type == "HDF5Dataset":
dataset = HDF5Dataset(
root=root,
type_mapper=idp,
get_Hamiltonian=get_Hamiltonian,
get_overlap=get_overlap,
get_DM=get_DM,
get_eigenvalues=get_eigenvalues,
info_files = info_files
)
elif dataset_type == "LMDBDataset":
dataset = LMDBDataset(
root=root,
type_mapper=idp,
orthogonal=orthogonal,
get_Hamiltonian=get_Hamiltonian,
get_overlap=get_overlap,
get_DM=get_DM,
get_eigenvalues=get_eigenvalues,
info_files = info_files
)
public_info = normalize_setinfo(public_info)
log.info("A public `info.json` file is provided, "
"and will be used by the subfolders who do not have their own `info.json` file.")

# Load info in each trajectory folder separately...
for folder in valid_folders:
# for LMDBDataset, the information is not set here
if dataset_type == "LMDBDataset":
info_files[folder] = {}
continue

# otherwise, at least the public info would be the default
finfo = Path(root) / folder / "info.json"
if not finfo.exists() and public_info is None: # no info in subfolder and no public info. then raise error.
log.error(f"for {dataset_type} type, the info.json is not properly provided for `{folder}`")
raise ValueError(f"for {dataset_type} type, the info.json is not properly provided for `{folder}`")

# the case that info is individually provided
if finfo.exists():
info_files[folder] = normalize_setinfo(j_loader(finfo))
else:
# use public info instead
assert public_info is not None
# yaml will not dump correctly if this is not a deepcopy.
info_files[folder] = deepcopy(public_info)

else:
raise ValueError(f"Not support dataset type: {type}.")

if not self.if_check_cutoffs:
log.warning("The cutoffs in data and model are not checked. be careful!")

return dataset
# finally, we sort the info_files here.
# The order itself is not important, but must be consistent for the same list.
info_files = {key: info_files[key] for key in sorted(info_files)}

# add the cutoff radius information for each valid folder (contains all the data files)
for folder in info_files:
info_files[folder].update({'r_max' : r_max,
'er_max' : er_max,
'oer_max': oer_max})

# after preprocessing, we can build the dataset
if dataset_type == "DeePHDataset":
return DeePHE3Dataset(root=root,
type_mapper=idp,
get_Hamiltonian=get_Hamiltonian,
get_eigenvalues=get_eigenvalues,
info_files=info_files)
elif dataset_type == "DefaultDataset":
return DefaultDataset(root=root,
type_mapper=idp,
get_Hamiltonian=get_Hamiltonian,
get_overlap=get_overlap,
get_DM=get_DM,
get_eigenvalues=get_eigenvalues,
info_files=info_files)
elif dataset_type == "HDF5Dataset":
return HDF5Dataset(root=root,
type_mapper=idp,
get_Hamiltonian=get_Hamiltonian,
get_overlap=get_overlap,
get_DM=get_DM,
get_eigenvalues=get_eigenvalues,
info_files=info_files)
else:
assert dataset_type == "LMDBDataset"
return LMDBDataset(root=root,
type_mapper=idp,
orthogonal=orthogonal,
get_Hamiltonian=get_Hamiltonian,
get_overlap=get_overlap,
get_DM=get_DM,
get_eigenvalues=get_eigenvalues,
info_files=info_files)
Comment on lines +248 to +280
Contributor

@coderabbitai coderabbitai bot Dec 5, 2025

⚠️ Potential issue | 🟠 Major

🧩 Analysis chain

🏁 Script executed:

# First, let's check the build.py file to find the __call__ method and from_model method
cd dptb/data && wc -l build.py



🏁 Script executed:

# Read the relevant sections of build.py to understand the method signatures
cat -n dptb/data/build.py | head -100



🏁 Script executed:

# Search for the __call__ method definition in build.py
rg -A 20 "def __call__" dptb/data/build.py



🏁 Script executed:

# Search for the from_model method definition
rg -A 20 "def from_model" dptb/data/build.py



🏁 Script executed:

# Look at the dataset class definitions to see if they accept **kwargs
fd "\.py$" dptb/data | xargs rg -l "class DeePHE3Dataset|class DefaultDataset|class HDF5Dataset|class LMDBDataset"



🌐 Web query:

DeePTB dptb/data/build.py DatasetBuilder __call__ github

💡 Result:

You can find the DatasetBuilder.__call__ implementation in dptb/data/build.py of the DeePTB repo on GitHub (shows the full function signature and logic for building DefaultDataset / DeePHDataset / HDF5Dataset / LMDBDataset). Source (raw file): [1]; repo page: [2].

Sources:
[1] DeePTB dptb/data/build.py (raw). https://github.com/deepmodeling/DeePTB/blob/main/dptb/data/build.py?raw=true.
[2] deepmodeling/DeePTB GitHub repository. https://github.com/deepmodeling/DeePTB.


🌐 Web query:

site:github.com/deepmodeling/DeePTB dptb/data/build.py __call__ kwargs

💡 Result:

The DatasetBuilder.__call__ signature (from dptb/data/build.py) is:

def __call__(self,
             root: str,
             r_max: Union[float, int, dict],
             er_max: Union[float, int, dict] = None,
             oer_max: Union[float, int, dict] = None,
             type: str = "DefaultDataset",
             prefix: str = None,
             separator: str = '.',
             get_Hamiltonian: bool = False,
             get_overlap: bool = False,
             get_DM: bool = False,
             get_eigenvalues: bool = False,
             orthogonal: bool = False,
             basis: str = None,
             **kwargs)

Notes:

  • These named args (root, r_max, er_max, oer_max, type, prefix, separator, get_Hamiltonian, get_overlap, get_DM, get_eigenvalues, orthogonal, basis) are the documented/used parameters; **kwargs is accepted but not referenced elsewhere in the function (i.e., extra keyword args are accepted but ignored). [1]

Source: dptb/data/build.py (DeePTB repository). [1]
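
For orientation, a minimal usage sketch based on the signature above. The import path, the zero-argument builder construction, and every parameter value are assumptions for illustration, not taken from the repository.

# Hypothetical usage of DatasetBuilder.__call__; paths, prefix, and cutoffs are
# illustrative, and the builder construction is assumed to take no arguments.
from dptb.data.build import DatasetBuilder  # import path assumed

builder = DatasetBuilder()          # the real __init__ may require arguments
dataset = builder(
    root="./trajectories",          # parent folder of <prefix><separator>* subfolders
    r_max=5.0,                      # default cutoff radius
    er_max=4.0,                     # environment cutoff (optional)
    type="DefaultDataset",
    prefix="frame",                 # selects subfolders such as frame.0, frame.1, ...
    separator=".",
    get_Hamiltonian=True,
)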


🏁 Script executed:

# Fetch the raw build.py file from GitHub
curl -s "https://github.com/deepmodeling/DeePTB/blob/main/dptb/data/build.py?raw=true" | head -300



🏁 Script executed:

# Get more of the file to find __call__ method
curl -s "https://github.com/deepmodeling/DeePTB/blob/main/dptb/data/build.py?raw=true" | tail -n +200 | head -150



🏁 Script executed:

# Search for dataset class definitions
curl -s "https://github.com/deepmodeling/DeePTB/blob/main/dptb/data/__init__.py?raw=true"



🏁 Script executed:

# Look for the dataset classes - try common locations
for file in dptb/data/dataset.py dptb/data/datasets.py dptb/datasets.py; do
  echo "=== Checking $file ===" 
  curl -s "https://raw.githubusercontent.com/deepmodeling/DeePTB/main/$file" | head -200
done



🌐 Web query:

site:github.com/deepmodeling/DeePTB "class DeePHE3Dataset" OR "class DefaultDataset" OR "class HDF5Dataset" OR "class LMDBDataset" __init__

💡 Result:

I searched the deepmodeling/DeePTB repository for those class names.

Findings:

  • "DefaultDataset" is referenced in the DeePTB docs (data_options example). [1][2]
  • I did not find occurrences of a class named "DeePHE3Dataset", nor explicit class definitions named "HDF5Dataset" or "LMDBDataset" in the repository search results I could access.

Sources:

  1. deepmodeling/DeePTB repository main page. [1]
  2. DeePTB docs (quick start input / data_options showing DefaultDataset). [2]

If you want, I can (a) do a deeper code search through the repo files (open specific subfolders) and return exact file/line hits, or (b) provide direct links to any matching source files. Which would you prefer?


🌐 Web query:

deepmodeling/DeePTB dataset classes __init__ signature DefaultDataset HDF5Dataset LMDBDataset

💡 Result:

I can fetch those exact __init__ signatures from the DeePTB repo and return them. Do you want me to look them up on GitHub now?


__call__ accepts **kwargs but silently ignores them instead of forwarding to dataset constructors

The __call__ method signature includes **kwargs, but these are never passed to any of the four dataset constructors (DeePHE3Dataset, DefaultDataset, HDF5Dataset, LMDBDataset). This means callers cannot configure dataset-specific options via build_dataset(..., some_option=...) — such arguments are silently dropped, which is surprising and error-prone.

Either forward **kwargs to the dataset constructors, or remove the parameter from __call__ and from_model to be explicit that extra options are not supported.
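
A minimal sketch of the forwarding option, assuming the dataset constructors tolerate the extra keyword arguments (this is not verified above); the helper name build_with_kwargs is hypothetical, and only two of the four dataset classes are shown.

# Hypothetical dispatch helper that forwards caller-supplied options instead of
# dropping them; whether each dataset class accepts them is an assumption.
from dptb.data.dataset import DefaultDataset
from dptb.data.dataset.lmdb_dataset import LMDBDataset

def build_with_kwargs(dataset_type, root, type_mapper, info_files,
                      orthogonal=False, **dataset_kwargs):
    if dataset_type == "DefaultDataset":
        return DefaultDataset(root=root,
                              type_mapper=type_mapper,
                              info_files=info_files,
                              **dataset_kwargs)    # e.g. get_Hamiltonian=True
    if dataset_type == "LMDBDataset":
        return LMDBDataset(root=root,
                           type_mapper=type_mapper,
                           orthogonal=orthogonal,
                           info_files=info_files,
                           **dataset_kwargs)
    raise ValueError(f"Unsupported dataset type: {dataset_type}")

The alternative in the suggestion above, dropping **kwargs from __call__ and from_model, would instead surface unexpected options as an immediate TypeError at the call site.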

Contributor Author

@kirk0830 kirk0830 Dec 5, 2025

@coderabbitai yes I noticed this, too. But this is not the problem brought about by this PR, right?

Contributor

@kirk0830 You're absolutely right! This is a pre-existing issue with the **kwargs signature, not something introduced by your refactoring. Your PR is focused on improving the folder discovery logic, Path usage, and info.json handling – all good changes.

The **kwargs was already there and remains unused in the original code as well. If you'd like to address it in a future PR (either by forwarding the kwargs or removing the parameter), I can help open a tracking issue for it. Otherwise, feel free to keep this PR focused on the refactoring work you've done.


def from_model(self,
model,