From da41d09e2532337cbf0663dabe0ebcd265a13d47 Mon Sep 17 00:00:00 2001 From: Charles Campbell Date: Tue, 28 Oct 2025 18:34:09 -0400 Subject: [PATCH 01/50] initialize diffractgpt script --- .../ramangpt/dataset_atomgpt_spectra.py | 24 +++++++++++++++++++ 1 file changed, 24 insertions(+) create mode 100644 atomgpt/scripts/ramangpt/dataset_atomgpt_spectra.py diff --git a/atomgpt/scripts/ramangpt/dataset_atomgpt_spectra.py b/atomgpt/scripts/ramangpt/dataset_atomgpt_spectra.py new file mode 100644 index 0000000..9f984b1 --- /dev/null +++ b/atomgpt/scripts/ramangpt/dataset_atomgpt_spectra.py @@ -0,0 +1,24 @@ +import numpy as np +from atomgpt.inverse_models.utils import smooth_xrd +from jarvis.io.vasp.inputs import Poscar +from jarvis.db.figshare import data +from jarvis.core.atoms import Atoms +from tqdm import tqdm + +d = data("dft_3d") +# d = data('alex_pbe_hull') + +f = open("id_prop.csv", "w") +count = 0 +for i in tqdm(d, total=len(d)): + # if count<10: + atoms = Atoms.from_dict(i["atoms"]) + jid = i["jid"] + poscar_name = "POSCAR-" + jid + ".vasp" + atoms.write_poscar(poscar_name) + y_new_str, cccc = smooth_xrd(atoms=atoms, intvl=0.3, thetas=[0, 90]) + f.write("%s,%s\n" % (poscar_name, y_new_str)) + count += 1 + # if count == max_samples: + # break +f.close() From bea7d59858faea0fadc8110ea4028640363a6033 Mon Sep 17 00:00:00 2001 From: Charles Campbell Date: Tue, 28 Oct 2025 18:39:13 -0400 Subject: [PATCH 02/50] initialize correct dataset script --- .../ramangpt/dataset_atomgpt_spectra.py | 24 --- .../ramangpt/dataset_atomgpt_spectra2.py | 137 ++++++++++++++++++ 2 files changed, 137 insertions(+), 24 deletions(-) delete mode 100644 atomgpt/scripts/ramangpt/dataset_atomgpt_spectra.py create mode 100644 atomgpt/scripts/ramangpt/dataset_atomgpt_spectra2.py diff --git a/atomgpt/scripts/ramangpt/dataset_atomgpt_spectra.py b/atomgpt/scripts/ramangpt/dataset_atomgpt_spectra.py deleted file mode 100644 index 9f984b1..0000000 --- a/atomgpt/scripts/ramangpt/dataset_atomgpt_spectra.py +++ /dev/null @@ -1,24 +0,0 @@ -import numpy as np -from atomgpt.inverse_models.utils import smooth_xrd -from jarvis.io.vasp.inputs import Poscar -from jarvis.db.figshare import data -from jarvis.core.atoms import Atoms -from tqdm import tqdm - -d = data("dft_3d") -# d = data('alex_pbe_hull') - -f = open("id_prop.csv", "w") -count = 0 -for i in tqdm(d, total=len(d)): - # if count<10: - atoms = Atoms.from_dict(i["atoms"]) - jid = i["jid"] - poscar_name = "POSCAR-" + jid + ".vasp" - atoms.write_poscar(poscar_name) - y_new_str, cccc = smooth_xrd(atoms=atoms, intvl=0.3, thetas=[0, 90]) - f.write("%s,%s\n" % (poscar_name, y_new_str)) - count += 1 - # if count == max_samples: - # break -f.close() diff --git a/atomgpt/scripts/ramangpt/dataset_atomgpt_spectra2.py b/atomgpt/scripts/ramangpt/dataset_atomgpt_spectra2.py new file mode 100644 index 0000000..4751bf1 --- /dev/null +++ b/atomgpt/scripts/ramangpt/dataset_atomgpt_spectra2.py @@ -0,0 +1,137 @@ +# This is an updated version of diffractgpt using find_peaks algorithm +import numpy as np +from scipy.signal import find_peaks +from jarvis.analysis.diffraction.xrd import XRD +from jarvis.core.atoms import Atoms +from jarvis.db.figshare import data +from tqdm import tqdm +from jarvis.db.jsonutils import dumpjson +def get_crystal_string_t(atoms): + lengths = atoms.lattice.abc # structure.lattice.parameters[:3] + angles = atoms.lattice.angles + atom_ids = atoms.elements + frac_coords = atoms.frac_coords + + crystal_str = ( + " ".join(["{0:.2f}".format(x) for x in lengths]) + + "\n" + 
+ " ".join([str(int(x)) for x in angles]) + + "\n" + + "\n".join( + [ + str(t) + " " + " ".join(["{0:.3f}".format(x) for x in c]) + for t, c in zip(atom_ids, frac_coords) + ] + ) + ) + + # crystal_str = atoms_describer(atoms) + "\n*\n" + crystal_str + return crystal_str + + +def gaussian_recast(x_original=[], y_original=[], x_new=[], sigma=.1): + y_new = np.zeros_like(x_new, dtype=np.float64) + for x0, amp in zip(x_original, y_original): + y_new += amp * np.exp(-0.5 * ((x_new - x0) / sigma) ** 2) + x_new=np.array(x_new) + y_new=np.array(y_new) + return x_new, y_new + +def make_diffractgpt_prompt(atoms, jid='na',thetas=[0, 90], num_peaks=20): + """Reads 2θ–intensity data, extracts top N peaks, and builds a prompt for DiffractGPT.""" + two_theta, d, intensity = XRD(thetas=thetas).simulate(atoms=atoms) + intensity = np.array(intensity) + intensity /= intensity.max() + + two_theta = np.array(two_theta) + x_new = np.arange(0, 90, .1) + two_theta,intensity = gaussian_recast(x_original=two_theta,y_original=intensity,x_new=x_new) + #print("two_theta",two_theta) + #print("intensity",intensity) + intensity /= intensity.max() + #print(intensity,max(intensity),len(intensity)) + + # Find all peaks (with minimal height threshold to ignore noise) + peaks, props = find_peaks(intensity, height=0.01, distance=1,prominence=0.05) + #print("peaks",peaks) + # Get top N peaks by intensity + top_indices = np.argsort(props['peak_heights'])[::-1][:num_peaks] + top_peaks = peaks[top_indices] + top_peaks_sorted = top_peaks[np.argsort(two_theta[top_peaks])] + + # Create list of (2θ, intensity) + peak_list = [(round(two_theta[p], 2), round(intensity[p], 2)) for p in top_peaks_sorted] + + # Build DiffractGPT prompt + peak_text = ", ".join([f"{t}°({i})" for t, i in peak_list]) + print(jid,peak_text) + num_peaks = len(peaks) + formula=atoms.composition.reduced_formula + input = ( + f"The chemical formula is: {formula}.\n" + f"The XRD pattern shows main peaks at: {peak_text}.\n" + f"Generate atomic structure description with lattice lengths, angles, coordinates and atom types." + ) + output= get_crystal_string_t(atoms) + info={} + info["instruction"]= "Below is a description of a material." 
+ info["input"]=input + info['id']=jid + info['peak_text']=peak_text + info["output"]=output + return info + +# Example usage +if __name__ == "__main__": + #atoms = Atoms.from_poscar('POSCAR') + jids=["JVASP-32","JVASP-15014","JVASP-1002","JVASP-107","JVASP-17957","JVASP-1151"] + f=open('id_prop.csv','w') + dat=data('dft_3d') #[0:num_samples] + test=[] + train=[] + for i in tqdm(dat,total=len(dat)): + if i['jid'] in jids: + atoms=Atoms.from_dict(i['atoms']) + prompt = make_diffractgpt_prompt(atoms, jid=i['jid'],num_peaks=20) + filename='POSCAR-'+i['jid'] + atoms.write_poscar(filename) + line=filename+','+prompt['peak_text']+'\n' + f.write(line) + + + + #print(i['jid'],prompt) + train.append(prompt) + if i['jid']=="JVASP-32": + atoms=Atoms.from_dict(i['atoms']) + prompt = make_diffractgpt_prompt(atoms, jid=i['jid'],num_peaks=20) + #print(i['jid'],prompt) + test.append(prompt) + train.append(prompt) + if i['jid']=="JVASP-15014": + atoms=Atoms.from_dict(i['atoms']) + prompt = make_diffractgpt_prompt(atoms, jid=i['jid'],num_peaks=20) + #print(i['jid'],prompt) + test.append(prompt) + train.append(prompt) + if i['jid']=="JVASP-1002": + atoms=Atoms.from_dict(i['atoms']) + prompt = make_diffractgpt_prompt(atoms, jid=i['jid'],num_peaks=20) + #print(i['jid'],prompt) + test.append(prompt) + train.append(prompt) + if i['jid']=="JVASP-107": + atoms=Atoms.from_dict(i['atoms']) + prompt = make_diffractgpt_prompt(atoms, jid=i['jid'],num_peaks=20) + #print(i['jid'],prompt) + test.append(prompt) + train.append(prompt) + if i['jid']=="JVASP-17957": + atoms=Atoms.from_dict(i['atoms']) + prompt = make_diffractgpt_prompt(atoms, jid=i['jid'],num_peaks=20) + #print(i['jid'],prompt) + test.append(prompt) + train.append(prompt) + f.close() + dumpjson(data=train,filename='alpaca_prop_train.json') + dumpjson(data=test,filename='alpaca_prop_test.json') From 5e6633b7bd8e865eff0a65cedc17ed9b1ea35387 Mon Sep 17 00:00:00 2001 From: crhysc Date: Wed, 29 Oct 2025 13:30:45 -0400 Subject: [PATCH 03/50] initialize code to make train test alpaca jsons --- atomgpt/scripts/ramangpt/alpaca_train_test.py | 55 +++++++++++++++++++ 1 file changed, 55 insertions(+) create mode 100644 atomgpt/scripts/ramangpt/alpaca_train_test.py diff --git a/atomgpt/scripts/ramangpt/alpaca_train_test.py b/atomgpt/scripts/ramangpt/alpaca_train_test.py new file mode 100644 index 0000000..dc10970 --- /dev/null +++ b/atomgpt/scripts/ramangpt/alpaca_train_test.py @@ -0,0 +1,55 @@ +import argparse +import json +from pathlib import Path +import numpy as np +from scipy.signal import find_peaks +from jarvis.core.atoms import Atoms +from tqdm import tqdm +import csv +import pandas as pd + +def get_crystal_string_t(atoms): + lengths = atoms.lattice.abc + angles = atoms.lattice.angles + atom_ids = atoms.elements + frac_coords = atoms.frac_coords + + crystal_str = ( + " ".join(["{0:.2f}".format(x) for x in lengths]) + + "\n" + + " ".join([str(int(x)) for x in angles]) + + "\n" + + "\n".join( + [ + str(t) + " " + " ".join(["{0:.3f}".format(x) for x in c]) + for t, c in zip(atom_ids, frac_coords) + ] + ) + ) + return crystal_str + +def main() -> None: + parser = argparse.ArgumentParser() + parser.add_argument("raman_json_path") + args = parser.parse_args() + df = pd.DataFrame() + with open(args.raman_json_path, mode="r", encoding="utf-8") as file: + data = json.load(file) + atoms = [] + for obj in data: + atoms = Atoms( + lattice_mat=data[obj]['atoms']['lattice_mat'], + coords=data[obj]['atoms']['coords'], + elements=data[obj]['atoms']['elements'] + ) + jid = 
data[obj]['id'] + intensities = data[obj]['raman_activity'] + frequencies = data[obj]['freq_cm'] + + + + + + +if __name__ == '__main__': + main() From a6d7842ba632cce5859e36abd69ac7db6b156b9e Mon Sep 17 00:00:00 2001 From: crhysc Date: Wed, 29 Oct 2025 15:48:55 -0400 Subject: [PATCH 04/50] version 1. should make sentences --- atomgpt/scripts/ramangpt/alpaca_train_test.py | 170 ++++++++++++++---- 1 file changed, 139 insertions(+), 31 deletions(-) diff --git a/atomgpt/scripts/ramangpt/alpaca_train_test.py b/atomgpt/scripts/ramangpt/alpaca_train_test.py index dc10970..516d8e3 100644 --- a/atomgpt/scripts/ramangpt/alpaca_train_test.py +++ b/atomgpt/scripts/ramangpt/alpaca_train_test.py @@ -1,55 +1,163 @@ +#!/usr/bin/env python3 +# make_raman_alpaca.py +# +# Read Raman JSON, keep only modes with activity > 0, format activities to N decimals, +# and write Alpaca-style train/test JSONs compatible with your finetuning script. + import argparse import json +import random from pathlib import Path + import numpy as np -from scipy.signal import find_peaks -from jarvis.core.atoms import Atoms from tqdm import tqdm -import csv -import pandas as pd +from jarvis.core.atoms import Atoms +from jarvis.db.jsonutils import dumpjson -def get_crystal_string_t(atoms): + +def get_crystal_string_t(atoms: Atoms) -> str: lengths = atoms.lattice.abc angles = atoms.lattice.angles atom_ids = atoms.elements frac_coords = atoms.frac_coords - crystal_str = ( - " ".join(["{0:.2f}".format(x) for x in lengths]) + " ".join("{0:.2f}".format(x) for x in lengths) + "\n" - + " ".join([str(int(x)) for x in angles]) + + " ".join(str(int(x)) for x in angles) + "\n" + "\n".join( - [ - str(t) + " " + " ".join(["{0:.3f}".format(x) for x in c]) - for t, c in zip(atom_ids, frac_coords) - ] + f"{t} " + " ".join("{0:.3f}".format(x) for x in c) + for t, c in zip(atom_ids, frac_coords) ) ) return crystal_str -def main() -> None: - parser = argparse.ArgumentParser() - parser.add_argument("raman_json_path") - args = parser.parse_args() - df = pd.DataFrame() - with open(args.raman_json_path, mode="r", encoding="utf-8") as file: - data = json.load(file) - atoms = [] - for obj in data: - atoms = Atoms( - lattice_mat=data[obj]['atoms']['lattice_mat'], - coords=data[obj]['atoms']['coords'], - elements=data[obj]['atoms']['elements'] - ) - jid = data[obj]['id'] - intensities = data[obj]['raman_activity'] - frequencies = data[obj]['freq_cm'] - +def format_fixed_decimals(val: float, decimals: int = 6) -> str: + """Format a number with fixed decimal places (handles scientific-notation inputs).""" + try: + v = float(val) + except Exception: + v = np.nan + if not np.isfinite(v): + return "0" # safe fallback + return f"{v:.{decimals}f}" + + +def make_raman_record( + entry: dict, + freq_decimals: int = 2, + activity_decimals: int = 6, +) -> dict | None: + atoms_dict = entry.get("atoms") + if not atoms_dict: + return None + + try: + atoms = Atoms.from_dict(atoms_dict) + except Exception: + return None + + try: + formula = atoms.composition.reduced_formula + except Exception: + formula = "Unknown" + + # Coerce to float; handles numbers or strings like "7.88E-7" + freqs = np.array(entry.get("freq_cm", []), dtype=float) + acts = np.array(entry.get("raman_activity", []), dtype=float) + + if freqs.size == 0 or acts.size == 0 or freqs.size != acts.size: + return None + + mask = acts > 0.0 + if not np.any(mask): + return None + + freqs_nz = freqs[mask] + acts_nz = acts[mask] + order = np.argsort(freqs_nz) + freqs_nz = freqs_nz[order] + acts_nz = acts_nz[order] + 
fmt_f = f"{{0:.{freq_decimals}f}}" + pairs = [ + f"{fmt_f.format(float(freq))} cm^-1({format_fixed_decimals(float(act), activity_decimals)})" + for freq, act in zip(freqs_nz, acts_nz) + ] + raman_text = ", ".join(pairs) + rec = { + "instruction": "Below is a description of a material.", + "input": ( + f"The chemical formula is: {formula}.\n" + f"The Raman spectrum shows active modes at: {raman_text}.\n" + f"Generate atomic structure description with lattice lengths, angles, coordinates and atom types." + ), + "output": get_crystal_string_t(atoms), + "id": entry.get("id", "na"), # kept in BOTH train & test for your evaluator + "raman_text": raman_text, # extra field; trainer ignores it + } + return rec -if __name__ == '__main__': + +def main(): + p = argparse.ArgumentParser( + description="Build Alpaca train/test JSONs from a Raman spectroscopy dataset." + ) + p.add_argument("--raman-json", type=Path, required=True, + help="Path to Raman JSON file (list of entries).") + p.add_argument("--test-ratio", type=float, default=0.1, + help="Fraction for test split (default: 0.10).") + p.add_argument("--seed", type=int, default=42, + help="Random seed for the split (default: 42).") + p.add_argument("--train-out", type=Path, default=Path("alpaca_prop_train.json"), + help="Output path for train JSON.") + p.add_argument("--test-out", type=Path, default=Path("alpaca_prop_test.json"), + help="Output path for test JSON.") + p.add_argument("--freq-decimals", type=int, default=2, + help="Decimals for frequencies in cm^-1 (default: 2).") + p.add_argument("--activity-decimals", type=int, default=6, + help="Decimals for Raman activities (default: 6).") + args = p.parse_args() + + with args.raman_json.open("r", encoding="utf-8") as f: + raw = json.load(f) + + records = [] + for entry in tqdm(raw, total=len(raw), desc="Processing Raman entries"): + rec = make_raman_record( + entry, + freq_decimals=args.freq_decimals, + activity_decimals=args.activity_decimals, + ) + if rec is not None: + records.append(rec) + + if not records: + raise SystemExit("No valid records with nonzero Raman activity were found.") + + rng = random.Random(args.seed) + rng.shuffle(records) + n_total = len(records) + n_test = max(1, int(round(args.test_ratio * n_total))) + test = records[:n_test] + train = records[n_test:] + + dumpjson(data=train, filename=str(args.train_out)) + dumpjson(data=test, filename=str(args.test_out)) + + print(f"Wrote {len(train)} train records → {args.train_out}") + print(f"Wrote {len(test)} test records → {args.test_out}") + + # Quick compatibility check for the evaluator (needs id/input/output) + ex = test[0] + for k in ("id", "instruction", "input", "output"): + if k not in ex: + print(f"WARNING: key '{k}' missing from test example!") + + +if __name__ == "__main__": main() + From fbb0c3acabf1b41161e589e4e061f4d76177cc28 Mon Sep 17 00:00:00 2001 From: crhysc Date: Wed, 29 Oct 2025 15:54:26 -0400 Subject: [PATCH 05/50] initialize runner --- atomgpt/scripts/ramangpt/runner.sh | 10 ++++++++++ 1 file changed, 10 insertions(+) create mode 100644 atomgpt/scripts/ramangpt/runner.sh diff --git a/atomgpt/scripts/ramangpt/runner.sh b/atomgpt/scripts/ramangpt/runner.sh new file mode 100644 index 0000000..bc4e309 --- /dev/null +++ b/atomgpt/scripts/ramangpt/runner.sh @@ -0,0 +1,10 @@ +#!/bin/bash + +python make_raman_alpaca.py \ + --raman-json ramandb.json \ + --test-ratio 0.1 \ + --train-out alpaca_prop_train.json \ + --test-out alpaca_prop_test.json \ + --freq-decimals 9 \ + --activity-decimals 6 + From 
fae7eee45c4c7e19f578acd92ac2dba0db8f2b2c Mon Sep 17 00:00:00 2001 From: Charles Campbell Date: Wed, 29 Oct 2025 16:04:25 -0400 Subject: [PATCH 06/50] name change --- .../ramangpt/{alpaca_train_test.py => make_raman_alpaca.py} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename atomgpt/scripts/ramangpt/{alpaca_train_test.py => make_raman_alpaca.py} (100%) diff --git a/atomgpt/scripts/ramangpt/alpaca_train_test.py b/atomgpt/scripts/ramangpt/make_raman_alpaca.py similarity index 100% rename from atomgpt/scripts/ramangpt/alpaca_train_test.py rename to atomgpt/scripts/ramangpt/make_raman_alpaca.py From b7a9cde7c4d700e98b2f60015cb753600e2417e2 Mon Sep 17 00:00:00 2001 From: Charles Campbell Date: Wed, 29 Oct 2025 16:12:55 -0400 Subject: [PATCH 07/50] update prompt to include () --- atomgpt/scripts/ramangpt/make_raman_alpaca.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/atomgpt/scripts/ramangpt/make_raman_alpaca.py b/atomgpt/scripts/ramangpt/make_raman_alpaca.py index 516d8e3..5529fb1 100644 --- a/atomgpt/scripts/ramangpt/make_raman_alpaca.py +++ b/atomgpt/scripts/ramangpt/make_raman_alpaca.py @@ -92,7 +92,7 @@ def make_raman_record( "instruction": "Below is a description of a material.", "input": ( f"The chemical formula is: {formula}.\n" - f"The Raman spectrum shows active modes at: {raman_text}.\n" + f"The Raman spectrum shows active modes with normalized intensities () at: {raman_text}.\n" f"Generate atomic structure description with lattice lengths, angles, coordinates and atom types." ), "output": get_crystal_string_t(atoms), From ff64e8361b6f745b0371770ac9aa779e1eb8e327 Mon Sep 17 00:00:00 2001 From: Charles Campbell Date: Wed, 29 Oct 2025 16:17:16 -0400 Subject: [PATCH 08/50] add cm^-1 to prompt --- atomgpt/scripts/ramangpt/make_raman_alpaca.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/atomgpt/scripts/ramangpt/make_raman_alpaca.py b/atomgpt/scripts/ramangpt/make_raman_alpaca.py index 5529fb1..fcedb48 100644 --- a/atomgpt/scripts/ramangpt/make_raman_alpaca.py +++ b/atomgpt/scripts/ramangpt/make_raman_alpaca.py @@ -83,7 +83,7 @@ def make_raman_record( fmt_f = f"{{0:.{freq_decimals}f}}" pairs = [ - f"{fmt_f.format(float(freq))} cm^-1({format_fixed_decimals(float(act), activity_decimals)})" + f"{fmt_f.format(float(freq))} ({format_fixed_decimals(float(act), activity_decimals)})" for freq, act in zip(freqs_nz, acts_nz) ] raman_text = ", ".join(pairs) @@ -92,7 +92,7 @@ def make_raman_record( "instruction": "Below is a description of a material.", "input": ( f"The chemical formula is: {formula}.\n" - f"The Raman spectrum shows active modes with normalized intensities () at: {raman_text}.\n" + f"The Raman spectrum shows active modes in cm^-1 with normalized intensities () at: {raman_text}.\n" f"Generate atomic structure description with lattice lengths, angles, coordinates and atom types." 
), "output": get_crystal_string_t(atoms), From 3b987f8e9d25e147d9d3b34560a1cf467d31fd83 Mon Sep 17 00:00:00 2001 From: Charles Campbell Date: Fri, 31 Oct 2025 14:44:37 -0400 Subject: [PATCH 09/50] remove rounding bug --- atomgpt/scripts/ramangpt/make_raman_alpaca.py | 118 ++---------------- 1 file changed, 8 insertions(+), 110 deletions(-) diff --git a/atomgpt/scripts/ramangpt/make_raman_alpaca.py b/atomgpt/scripts/ramangpt/make_raman_alpaca.py index fcedb48..dd2e8b4 100644 --- a/atomgpt/scripts/ramangpt/make_raman_alpaca.py +++ b/atomgpt/scripts/ramangpt/make_raman_alpaca.py @@ -1,49 +1,3 @@ -#!/usr/bin/env python3 -# make_raman_alpaca.py -# -# Read Raman JSON, keep only modes with activity > 0, format activities to N decimals, -# and write Alpaca-style train/test JSONs compatible with your finetuning script. - -import argparse -import json -import random -from pathlib import Path - -import numpy as np -from tqdm import tqdm -from jarvis.core.atoms import Atoms -from jarvis.db.jsonutils import dumpjson - - -def get_crystal_string_t(atoms: Atoms) -> str: - lengths = atoms.lattice.abc - angles = atoms.lattice.angles - atom_ids = atoms.elements - frac_coords = atoms.frac_coords - crystal_str = ( - " ".join("{0:.2f}".format(x) for x in lengths) - + "\n" - + " ".join(str(int(x)) for x in angles) - + "\n" - + "\n".join( - f"{t} " + " ".join("{0:.3f}".format(x) for x in c) - for t, c in zip(atom_ids, frac_coords) - ) - ) - return crystal_str - - -def format_fixed_decimals(val: float, decimals: int = 6) -> str: - """Format a number with fixed decimal places (handles scientific-notation inputs).""" - try: - v = float(val) - except Exception: - v = np.nan - if not np.isfinite(v): - return "0" # safe fallback - return f"{v:.{decimals}f}" - - def make_raman_record( entry: dict, freq_decimals: int = 2, @@ -70,7 +24,11 @@ def make_raman_record( if freqs.size == 0 or acts.size == 0 or freqs.size != acts.size: return None - mask = acts > 0.0 + # NEW: drop NaN/inf, then drop anything that *rounds* to 0.00... at the chosen precision + acts = np.where(np.isfinite(acts), acts, 0.0) + rounded = np.round(acts, decimals=activity_decimals) + mask = rounded != 0.0 # this also excludes "-0.00" + if not np.any(mask): return None @@ -83,7 +41,7 @@ def make_raman_record( fmt_f = f"{{0:.{freq_decimals}f}}" pairs = [ - f"{fmt_f.format(float(freq))} ({format_fixed_decimals(float(act), activity_decimals)})" + f"{fmt_f.format(float(freq))} ({format_fixed_decimals(float(act), activity_decicals)})" for freq, act in zip(freqs_nz, acts_nz) ] raman_text = ", ".join(pairs) @@ -96,68 +54,8 @@ def make_raman_record( f"Generate atomic structure description with lattice lengths, angles, coordinates and atom types." ), "output": get_crystal_string_t(atoms), - "id": entry.get("id", "na"), # kept in BOTH train & test for your evaluator - "raman_text": raman_text, # extra field; trainer ignores it + "id": entry.get("id", "na"), + "raman_text": raman_text, } return rec - -def main(): - p = argparse.ArgumentParser( - description="Build Alpaca train/test JSONs from a Raman spectroscopy dataset." 
- ) - p.add_argument("--raman-json", type=Path, required=True, - help="Path to Raman JSON file (list of entries).") - p.add_argument("--test-ratio", type=float, default=0.1, - help="Fraction for test split (default: 0.10).") - p.add_argument("--seed", type=int, default=42, - help="Random seed for the split (default: 42).") - p.add_argument("--train-out", type=Path, default=Path("alpaca_prop_train.json"), - help="Output path for train JSON.") - p.add_argument("--test-out", type=Path, default=Path("alpaca_prop_test.json"), - help="Output path for test JSON.") - p.add_argument("--freq-decimals", type=int, default=2, - help="Decimals for frequencies in cm^-1 (default: 2).") - p.add_argument("--activity-decimals", type=int, default=6, - help="Decimals for Raman activities (default: 6).") - args = p.parse_args() - - with args.raman_json.open("r", encoding="utf-8") as f: - raw = json.load(f) - - records = [] - for entry in tqdm(raw, total=len(raw), desc="Processing Raman entries"): - rec = make_raman_record( - entry, - freq_decimals=args.freq_decimals, - activity_decimals=args.activity_decimals, - ) - if rec is not None: - records.append(rec) - - if not records: - raise SystemExit("No valid records with nonzero Raman activity were found.") - - rng = random.Random(args.seed) - rng.shuffle(records) - n_total = len(records) - n_test = max(1, int(round(args.test_ratio * n_total))) - test = records[:n_test] - train = records[n_test:] - - dumpjson(data=train, filename=str(args.train_out)) - dumpjson(data=test, filename=str(args.test_out)) - - print(f"Wrote {len(train)} train records → {args.train_out}") - print(f"Wrote {len(test)} test records → {args.test_out}") - - # Quick compatibility check for the evaluator (needs id/input/output) - ex = test[0] - for k in ("id", "instruction", "input", "output"): - if k not in ex: - print(f"WARNING: key '{k}' missing from test example!") - - -if __name__ == "__main__": - main() - From 2425df924f24751635f0f7ebfa5fc1aa4459599e Mon Sep 17 00:00:00 2001 From: Charles Campbell Date: Fri, 31 Oct 2025 14:49:48 -0400 Subject: [PATCH 10/50] rest of the script --- atomgpt/scripts/ramangpt/make_raman_alpaca.py | 133 ++++++++++++++++-- 1 file changed, 122 insertions(+), 11 deletions(-) diff --git a/atomgpt/scripts/ramangpt/make_raman_alpaca.py b/atomgpt/scripts/ramangpt/make_raman_alpaca.py index dd2e8b4..ffa1a4c 100644 --- a/atomgpt/scripts/ramangpt/make_raman_alpaca.py +++ b/atomgpt/scripts/ramangpt/make_raman_alpaca.py @@ -1,3 +1,50 @@ +#!/usr/bin/env python3 +# make_raman_alpaca.py +# +# Read Raman JSON, keep only modes whose activity is non-zero AFTER rounding +# to the requested precision (so "0.000000" modes are dropped), format values, +# and write Alpaca-style train/test JSONs compatible with your finetuning script. 
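+#
+# Typical invocation (flags mirror runner.sh in this directory):
+#   python make_raman_alpaca.py --raman-json ramandb.json --test-ratio 0.1 \
+#     --train-out alpaca_prop_train.json --test-out alpaca_prop_test.json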
+ +import argparse +import json +import random +from pathlib import Path + +import numpy as np +from tqdm import tqdm +from jarvis.core.atoms import Atoms +from jarvis.db.jsonutils import dumpjson + + +def get_crystal_string_t(atoms: Atoms) -> str: + lengths = atoms.lattice.abc + angles = atoms.lattice.angles + atom_ids = atoms.elements + frac_coords = atoms.frac_coords + crystal_str = ( + " ".join("{0:.2f}".format(x) for x in lengths) + + "\n" + + " ".join(str(int(x)) for x in angles) + + "\n" + + "\n".join( + f"{t} " + " ".join("{0:.3f}".format(x) for x in c) + for t, c in zip(atom_ids, frac_coords) + ) + ) + return crystal_str + + +def format_fixed_decimals(val: float, decimals: int = 6) -> str: + """Format a number with fixed decimal places (handles scientific-notation inputs).""" + try: + v = float(val) + except Exception: + return "0" + if not np.isfinite(v): + return "0" + return f"{v:.{decimals}f}" + + def make_raman_record( entry: dict, freq_decimals: int = 2, @@ -24,25 +71,29 @@ def make_raman_record( if freqs.size == 0 or acts.size == 0 or freqs.size != acts.size: return None - # NEW: drop NaN/inf, then drop anything that *rounds* to 0.00... at the chosen precision + # Drop non-finite, then exclude anything that *appears* as 0.00... after rounding acts = np.where(np.isfinite(acts), acts, 0.0) - rounded = np.round(acts, decimals=activity_decimals) - mask = rounded != 0.0 # this also excludes "-0.00" + acts_rounded = np.round(acts, decimals=activity_decimals) + keep_mask = acts_rounded != 0.0 # also drops "-0.0" - if not np.any(mask): + if not np.any(keep_mask): return None - freqs_nz = freqs[mask] - acts_nz = acts[mask] + freqs_kept = freqs[keep_mask] + acts_kept = acts[keep_mask] + acts_rounded_kept = acts_rounded[keep_mask] - order = np.argsort(freqs_nz) - freqs_nz = freqs_nz[order] - acts_nz = acts_nz[order] + # Sort by frequency + order = np.argsort(freqs_kept) + freqs_kept = freqs_kept[order] + acts_kept = acts_kept[order] + acts_rounded_kept = acts_rounded_kept[order] + # Format output strings fmt_f = f"{{0:.{freq_decimals}f}}" pairs = [ - f"{fmt_f.format(float(freq))} ({format_fixed_decimals(float(act), activity_decicals)})" - for freq, act in zip(freqs_nz, acts_nz) + f"{fmt_f.format(float(freq))} ({format_fixed_decimals(float(act_r), activity_decimals)})" + for freq, act_r in zip(freqs_kept, acts_rounded_kept) ] raman_text = ", ".join(pairs) @@ -59,3 +110,63 @@ def make_raman_record( } return rec + +def main(): + p = argparse.ArgumentParser( + description="Build Alpaca train/test JSONs from a Raman spectroscopy dataset." 
+ ) + p.add_argument("--raman-json", type=Path, required=True, + help="Path to Raman JSON file (list of entries).") + p.add_argument("--test-ratio", type=float, default=0.1, + help="Fraction for test split (default: 0.10).") + p.add_argument("--seed", type=int, default=42, + help="Random seed for the split (default: 42).") + p.add_argument("--train-out", type=Path, default=Path("alpaca_prop_train.json"), + help="Output path for train JSON.") + p.add_argument("--test-out", type=Path, default=Path("alpaca_prop_test.json"), + help="Output path for test JSON.") + p.add_argument("--freq-decimals", type=int, default=2, + help="Decimals for frequencies in cm^-1 (default: 2).") + p.add_argument("--activity-decimals", type=int, default=6, + help="Decimals for Raman activities (default: 6).") + args = p.parse_args() + + with args.raman_json.open("r", encoding="utf-8") as f: + raw = json.load(f) + + records = [] + for entry in tqdm(raw, total=len(raw), desc="Processing Raman entries"): + rec = make_raman_record( + entry, + freq_decimals=args.freq_decimals, + activity_decimals=args.activity_decimals, + ) + if rec is not None: + records.append(rec) + + if not records: + raise SystemExit("No valid records with nonzero Raman activity (after rounding) were found.") + + rng = random.Random(args.seed) + rng.shuffle(records) + n_total = len(records) + n_test = max(1, int(round(args.test_ratio * n_total))) + test = records[:n_test] + train = records[n_test:] + + dumpjson(data=train, filename=str(args.train_out)) + dumpjson(data=test, filename=str(args.test_out)) + + print(f"Wrote {len(train)} train records → {args.train_out}") + print(f"Wrote {len(test)} test records → {args.test_out}") + + # Quick compatibility check for the evaluator (needs id/input/output) + ex = test[0] + for k in ("id", "instruction", "input", "output"): + if k not in ex: + print(f"WARNING: key '{k}' missing from test example!") + + +if __name__ == "__main__": + main() + From 9f0b3c2a4607303ae8cc408cf0cce44c2093bb32 Mon Sep 17 00:00:00 2001 From: Charles Campbell Date: Fri, 31 Oct 2025 15:35:15 -0400 Subject: [PATCH 11/50] freq normal and niggli reduce --- atomgpt/scripts/ramangpt/make_raman_alpaca.py | 86 +++++++++++++++---- 1 file changed, 70 insertions(+), 16 deletions(-) diff --git a/atomgpt/scripts/ramangpt/make_raman_alpaca.py b/atomgpt/scripts/ramangpt/make_raman_alpaca.py index ffa1a4c..12338fa 100644 --- a/atomgpt/scripts/ramangpt/make_raman_alpaca.py +++ b/atomgpt/scripts/ramangpt/make_raman_alpaca.py @@ -1,9 +1,10 @@ #!/usr/bin/env python3 # make_raman_alpaca.py # -# Read Raman JSON, keep only modes whose activity is non-zero AFTER rounding -# to the requested precision (so "0.000000" modes are dropped), format values, -# and write Alpaca-style train/test JSONs compatible with your finetuning script. +# Read Raman JSON, optionally Niggli-reduce cells, keep only modes whose activity +# is non-zero AFTER rounding to the requested precision, optionally normalize +# frequencies to [0,1] (1.0 = max kept freq), format values, and write +# Alpaca-style train/test JSONs. import argparse import json @@ -34,6 +35,31 @@ def get_crystal_string_t(atoms: Atoms) -> str: return crystal_str +def niggli_reduce_atoms(atoms: Atoms) -> Atoms: + """ + Try to Niggli-reduce using pymatgen (preferred). + Falls back to returning the original atoms if reduction fails or pymatgen is absent. 
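+    Niggli reduction rewrites the cell in a canonical reduced basis
+    (shortest lattice vectors), so equivalent cells tend to serialize
+    to the same crystal string.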
+ """ + try: + from pymatgen.core import Structure, Lattice # lazy import + species = list(atoms.elements) # per-site symbols + frac = np.array(atoms.frac_coords, dtype=float) + lat = np.array(atoms.lattice.matrix, dtype=float) + pmg = Structure(Lattice(lat), species, frac, coords_are_cartesian=False) + + # Niggli reduction on the full structure (updates lattice + fractional coords) + reduced, _ = pmg.get_reduced_structure(reduction_algo="niggli") + return Atoms( + lattice_mat=np.array(reduced.lattice.matrix), + coords=np.array(reduced.frac_coords), + elements=[str(s) for s in reduced.species], + cartesian=False, + ) + except Exception: + # Best-effort fallback: return original if anything goes wrong + return atoms + + def format_fixed_decimals(val: float, decimals: int = 6) -> str: """Format a number with fixed decimal places (handles scientific-notation inputs).""" try: @@ -49,6 +75,8 @@ def make_raman_record( entry: dict, freq_decimals: int = 2, activity_decimals: int = 6, + normalize_freq: bool = False, + niggli: bool = False, ) -> dict | None: atoms_dict = entry.get("atoms") if not atoms_dict: @@ -59,6 +87,10 @@ def make_raman_record( except Exception: return None + # Optional Niggli reduction BEFORE anything else + if niggli: + atoms = niggli_reduce_atoms(atoms) + try: formula = atoms.composition.reduced_formula except Exception: @@ -80,30 +112,45 @@ def make_raman_record( return None freqs_kept = freqs[keep_mask] - acts_kept = acts[keep_mask] acts_rounded_kept = acts_rounded[keep_mask] - # Sort by frequency - order = np.argsort(freqs_kept) - freqs_kept = freqs_kept[order] - acts_kept = acts_kept[order] + # Optional normalize frequencies to [0,1], with 1.0 = max kept frequency + if normalize_freq: + max_f = float(np.max(freqs_kept)) if freqs_kept.size else 0.0 + if max_f > 0.0: + freqs_display = freqs_kept / max_f # zero maps to 0.0, max -> 1.0 + else: + freqs_display = np.zeros_like(freqs_kept) + freq_unit_caption = "normalized frequency 0–1" + else: + freqs_display = freqs_kept + freq_unit_caption = "cm^-1" + + # Sort by *display* frequency so ordering matches what we print + order = np.argsort(freqs_display) + freqs_display = freqs_display[order] + freqs_kept = freqs_kept[order] # keep original too, in case needed later acts_rounded_kept = acts_rounded_kept[order] # Format output strings fmt_f = f"{{0:.{freq_decimals}f}}" pairs = [ - f"{fmt_f.format(float(freq))} ({format_fixed_decimals(float(act_r), activity_decimals)})" - for freq, act_r in zip(freqs_kept, acts_rounded_kept) + f"{fmt_f.format(float(fd))} ({format_fixed_decimals(float(act_r), activity_decimals)})" + for fd, act_r in zip(freqs_display, acts_rounded_kept) ] raman_text = ", ".join(pairs) + # Build prompt text + input_header = ( + f"The chemical formula is: {formula}.\n" + f"The Raman spectrum shows active modes in {freq_unit_caption} " + f"with normalized intensities () at: {raman_text}.\n" + f"Generate atomic structure description with lattice lengths, angles, coordinates and atom types." + ) + rec = { "instruction": "Below is a description of a material.", - "input": ( - f"The chemical formula is: {formula}.\n" - f"The Raman spectrum shows active modes in cm^-1 with normalized intensities () at: {raman_text}.\n" - f"Generate atomic structure description with lattice lengths, angles, coordinates and atom types." 
- ), + "input": input_header, "output": get_crystal_string_t(atoms), "id": entry.get("id", "na"), "raman_text": raman_text, @@ -126,9 +173,13 @@ def main(): p.add_argument("--test-out", type=Path, default=Path("alpaca_prop_test.json"), help="Output path for test JSON.") p.add_argument("--freq-decimals", type=int, default=2, - help="Decimals for frequencies in cm^-1 (default: 2).") + help="Decimals for frequencies (cm^-1 or normalized), default: 2.") p.add_argument("--activity-decimals", type=int, default=6, help="Decimals for Raman activities (default: 6).") + p.add_argument("--normalize-freq", action="store_true", + help="Normalize frequencies to [0,1]; 1.0 = max kept frequency after intensity rounding.") + p.add_argument("--niggli-reduce", action="store_true", + help="Apply Niggli reduction to each cell before partitioning into train/test.") args = p.parse_args() with args.raman_json.open("r", encoding="utf-8") as f: @@ -140,6 +191,8 @@ def main(): entry, freq_decimals=args.freq_decimals, activity_decimals=args.activity_decimals, + normalize_freq=args.normalize_freq, + niggli=args.niggli_reduce, ) if rec is not None: records.append(rec) @@ -147,6 +200,7 @@ def main(): if not records: raise SystemExit("No valid records with nonzero Raman activity (after rounding) were found.") + # Shuffle & split AFTER optional Niggli reduction (as requested) rng = random.Random(args.seed) rng.shuffle(records) n_total = len(records) From 2771adf70cc4ae5f2a49d66fc56e8ff0ec71a3c8 Mon Sep 17 00:00:00 2001 From: Charles Campbell Date: Tue, 4 Nov 2025 00:21:22 -0500 Subject: [PATCH 12/50] add freq upper bound --- atomgpt/scripts/ramangpt/make_raman_alpaca.py | 61 ++++++++++++++----- 1 file changed, 45 insertions(+), 16 deletions(-) diff --git a/atomgpt/scripts/ramangpt/make_raman_alpaca.py b/atomgpt/scripts/ramangpt/make_raman_alpaca.py index 12338fa..8cfd65b 100644 --- a/atomgpt/scripts/ramangpt/make_raman_alpaca.py +++ b/atomgpt/scripts/ramangpt/make_raman_alpaca.py @@ -18,21 +18,37 @@ def get_crystal_string_t(atoms: Atoms) -> str: - lengths = atoms.lattice.abc - angles = atoms.lattice.angles - atom_ids = atoms.elements - frac_coords = atoms.frac_coords - crystal_str = ( - " ".join("{0:.2f}".format(x) for x in lengths) - + "\n" - + " ".join(str(int(x)) for x in angles) - + "\n" - + "\n".join( - f"{t} " + " ".join("{0:.3f}".format(x) for x in c) - for t, c in zip(atom_ids, frac_coords) - ) + # Lattice + lengths = np.array(atoms.lattice.abc, dtype=float).ravel() + angles = np.array(atoms.lattice.angles, dtype=float).ravel() + + # Per-site species and fractional coordinates; force shape (N, 3) + atom_ids = [str(x) for x in list(atoms.elements)] + frac = np.asarray(atoms.frac_coords, dtype=float) + if frac.ndim == 1: + if frac.size == 3: + frac = frac.reshape(1, 3) + else: + raise ValueError(f"Unexpected fractional coord shape: {frac.shape}") + elif frac.ndim == 2 and frac.shape[1] != 3: + raise ValueError(f"Expected frac coords with 3 columns, got {frac.shape}") + + # If species length doesn't match coords, broadcast a single species tag + if len(atom_ids) != len(frac): + if len(atom_ids) == 1 and len(frac) > 1: + atom_ids = atom_ids * len(frac) + else: + raise ValueError( + f"Elements length ({len(atom_ids)}) != coords length ({len(frac)})" + ) + + lengths_str = " ".join(f"{x:.2f}" for x in lengths.tolist()) + angles_str = " ".join(f"{x:.2f}" for x in angles.tolist()) + coords_str = "\n".join( + f"{t} " + " ".join(f"{c:.3f}" for c in row.tolist()) + for t, row in zip(atom_ids, frac) ) - return crystal_str + 
return f"{lengths_str}\n{angles_str}\n{coords_str}" def niggli_reduce_atoms(atoms: Atoms) -> Atoms: @@ -77,6 +93,7 @@ def make_raman_record( activity_decimals: int = 6, normalize_freq: bool = False, niggli: bool = False, + include_max_freq: bool = False, ) -> dict | None: atoms_dict = entry.get("atoms") if not atoms_dict: @@ -115,8 +132,8 @@ def make_raman_record( acts_rounded_kept = acts_rounded[keep_mask] # Optional normalize frequencies to [0,1], with 1.0 = max kept frequency + max_f = float(np.max(freqs_kept)) if freqs_kept.size else 0.0 if normalize_freq: - max_f = float(np.max(freqs_kept)) if freqs_kept.size else 0.0 if max_f > 0.0: freqs_display = freqs_kept / max_f # zero maps to 0.0, max -> 1.0 else: @@ -141,10 +158,17 @@ def make_raman_record( raman_text = ", ".join(pairs) # Build prompt text + extra_norm_line = "" + if normalize_freq and include_max_freq and max_f > 0.0: + extra_norm_line = ( + f"\nNormalization reference: 1.00 corresponds to " + f"{fmt_f.format(max_f)} cm^-1." + ) + input_header = ( f"The chemical formula is: {formula}.\n" f"The Raman spectrum shows active modes in {freq_unit_caption} " - f"with normalized intensities () at: {raman_text}.\n" + f"with normalized intensities () at: {raman_text}.{extra_norm_line}\n" f"Generate atomic structure description with lattice lengths, angles, coordinates and atom types." ) @@ -155,6 +179,8 @@ def make_raman_record( "id": entry.get("id", "na"), "raman_text": raman_text, } + if normalize_freq and include_max_freq: + rec["max_freq_cm"] = float(max_f) return rec @@ -178,6 +204,8 @@ def main(): help="Decimals for Raman activities (default: 6).") p.add_argument("--normalize-freq", action="store_true", help="Normalize frequencies to [0,1]; 1.0 = max kept frequency after intensity rounding.") + p.add_argument("--include-max-freq", action="store_true", + help="When used with --normalize-freq, include the unnormalized max frequency (that maps to 1.0) in the prompt.") p.add_argument("--niggli-reduce", action="store_true", help="Apply Niggli reduction to each cell before partitioning into train/test.") args = p.parse_args() @@ -193,6 +221,7 @@ def main(): activity_decimals=args.activity_decimals, normalize_freq=args.normalize_freq, niggli=args.niggli_reduce, + include_max_freq=args.include_max_freq, ) if rec is not None: records.append(rec) From 69c3463c4f141a11453129c92cc098b89b92e12e Mon Sep 17 00:00:00 2001 From: crhysc Date: Thu, 13 Nov 2025 13:56:05 -0500 Subject: [PATCH 13/50] initial commit --- atomgpt/inverse_models/gpt_oss.py | 154 ++++++++++++++++++++++++++++++ 1 file changed, 154 insertions(+) create mode 100644 atomgpt/inverse_models/gpt_oss.py diff --git a/atomgpt/inverse_models/gpt_oss.py b/atomgpt/inverse_models/gpt_oss.py new file mode 100644 index 0000000..dc6a395 --- /dev/null +++ b/atomgpt/inverse_models/gpt_oss.py @@ -0,0 +1,154 @@ +from atomgpt.inverse_models.llama import * # noqa: F401,F403 +import os + +from atomgpt.inverse_models._utils import __version__ # noqa: F401 +from atomgpt.inverse_models._utils2 import Version, _get_dtype # noqa: F401 + +try: + # New HF GPT-OSS modeling API + from transformers.models.gpt_oss.modeling_gpt_oss import ( + GptOssModel, + GptOssForCausalLM, + ) +except Exception as exc: # pragma: no cover + raise ImportError( + "AtomGPT: transformers installation does not appear to include " + "the `gpt_oss` model. Please upgrade transformers:\n" + ' pip install --upgrade "transformers"\n' + "and ensure you are on a release that supports GPT-OSS." 
+ ) from exc + + +# Convenience list of all 4 Unsloth GPT-OSS models that are supported via +# FastLanguageModel.from_pretrained(..., model_name=...). +# +# You can use these as drop-in `model_name` values: +# +# from atomgpt.inverse_models.gpt_oss import UNSLOTH_GPT_OSS_MODELS +# model_name = UNSLOTH_GPT_OSS_MODELS[0] +# +UNSLOTH_GPT_OSS_MODELS = [ + # BitsAndBytes 4bit Unsloth quantizations + "unsloth/gpt-oss-20b-unsloth-bnb-4bit", + "unsloth/gpt-oss-120b-unsloth-bnb-4bit", + # MXFP4 “original” weights that Unsloth wraps + "unsloth/gpt-oss-20b", + "unsloth/gpt-oss-120b", +] + + +def _log_once(msg: str) -> None: + """Tiny helper to avoid spamming logs if imported multiple times.""" + if getattr(_log_once, "_seen", None) is None: + _log_once._seen = set() + if msg in _log_once._seen: + return + _log_once._seen.add(msg) + print(msg) + + +class FastGptOssModel(FastLlamaModel): + """ + Fast GPT-OSS integration for AtomGPT. + + This mirrors the overall structure of `FastMistralModel` but takes a more + conservative approach: + + * We **do not** override GPT-OSS attention / MoE internals. Those are + handled by the upstream `transformers` implementation and whatever + `unsloth_compile_transformers` is already doing in your loader. + * We **do**: + - patch PEFT `PeftModelForCausalLM.forward` to the same fast path + that LLaMA / Mistral use. + - patch `prepare_inputs_for_generation` for `GptOssForCausalLM`, so + that your generation path stays consistent with LLaMA / Mistral. + * Everything else is delegated to `FastLlamaModel.from_pretrained` with + `model_patcher=FastGptOssModel`, to keep the hierarchy uniform. + """ + + @staticmethod + def pre_patch(): + """ + Apply GPT-OSS-specific patches. + + We deliberately do **not** touch GPT-OSS attention / decoder layer + implementations here, to avoid shape / MoE wiring mistakes. Instead we + reuse only the architecture-agnostic bits from `llama.py`. + """ + # Reuse the PEFT fast forward path (architecture-agnostic: it only + # assumes a CausalLM head with `.lm_head`). + global PeftModelForCausalLM # imported from llama.py via * + PeftModelForCausalLM.forward = PeftModelForCausalLM_fast_forward + + # Reuse the generation input patcher so GPT-OSS works with your + # custom `prepare_inputs_for_generation` handling (past-key-values, + # sliding window, etc.), analogous to Mistral / LLaMA. + fix_prepare_inputs_for_generation(GptOssForCausalLM) + + _log_once( + "AtomGPT: Patched GPT-OSS (GptOssForCausalLM + PEFT) for " + "FastLanguageModel integration." + ) + return + + @staticmethod + def from_pretrained( + model_name: str = "unsloth/gpt-oss-20b-unsloth-bnb-4bit", + max_seq_length: int | None = None, + dtype=None, + load_in_4bit: bool = True, + token=None, + device_map: str | dict = "sequential", + rope_scaling=None, # GPT-OSS does not use classic RoPE scaling, kept for API symmetry + fix_tokenizer: bool = True, + model_patcher=None, + tokenizer_name: str | None = None, + trust_remote_code: bool = False, + **kwargs, + ): + """ + Thin wrapper around `FastLlamaModel.from_pretrained`. + + The important part is that we pass `model_patcher=FastGptOssModel`, + which causes: + + * `FastGptOssModel.pre_patch()` to run before loading. + * All the Unsloth / AtomGPT compile + quantization machinery to be + reused exactly as for LLaMA / Mistral. 
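+
+        The checkpoints listed in UNSLOTH_GPT_OSS_MODELS above are
+        known-good values for `model_name`.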
+ + Usage (drop-in with your loader): + + from atomgpt.inverse_models.loader import FastLanguageModel + + model, tokenizer = FastLanguageModel.from_pretrained( + model_name="unsloth/gpt-oss-20b-unsloth-bnb-4bit", + max_seq_length=2048, + dtype=None, + load_in_4bit=True, + ) + """ + # Defer to the LLaMA machinery – it will: + # * call FastGptOssModel.pre_patch() + # * run unsloth_compile_transformers + # * handle bitsandbytes / 4bit / 8bit / PEFT, etc. + return FastLlamaModel.from_pretrained( + model_name=model_name, + max_seq_length=max_seq_length, + dtype=dtype, + load_in_4bit=load_in_4bit, + token=token, + device_map=device_map, + rope_scaling=rope_scaling, + fix_tokenizer=fix_tokenizer, + model_patcher=FastGptOssModel if model_patcher is None else model_patcher, + tokenizer_name=tokenizer_name, + trust_remote_code=trust_remote_code, + **kwargs, + ) + + +__all__ = [ + "FastGptOssModel", + "UNSLOTH_GPT_OSS_MODELS", +] + From 1d151572bc7c5248759bb1079ed1d562418e1736 Mon Sep 17 00:00:00 2001 From: "C. Rhys Campbell" <149001340+crhysc@users.noreply.github.com> Date: Thu, 13 Nov 2025 14:02:14 -0500 Subject: [PATCH 14/50] Update loader.py --- atomgpt/inverse_models/loader.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/atomgpt/inverse_models/loader.py b/atomgpt/inverse_models/loader.py index 2a66eeb..e473adc 100644 --- a/atomgpt/inverse_models/loader.py +++ b/atomgpt/inverse_models/loader.py @@ -6,6 +6,7 @@ USE_MODELSCOPE, get_transformers_model_type, ) +from atomgpt.inverse_models.gpt_oss import FastGptOssModel from atomgpt.inverse_models.granite import FastGraniteModel from atomgpt.inverse_models.llama import FastLlamaModel, logger from atomgpt.inverse_models.mistral import FastMistralModel @@ -44,6 +45,7 @@ SUPPORTS_GRANITE = transformers_version >= Version("4.46.0") SUPPORTS_QWEN3 = transformers_version >= Version("4.50.3") SUPPORTS_QWEN3_MOE = transformers_version >= Version("4.50.3") +SUPPORTS_GPT_OSS = transformers_version >= Version("4.55.0") if SUPPORTS_GEMMA: from atomgpt.inverse_models.gemma import FastGemmaModel if SUPPORTS_GEMMA2: @@ -294,6 +296,17 @@ def from_pretrained( f"to obtain the latest transformers build, then restart this session." ) dispatch_model = FastGemmaModel + elif model_type == "gpt_oss": + if not SUPPORTS_GPT_OSS: + raise ImportError( + f"AtomGPT: Your transformers version of {transformers_version} " + f"does not support GPT-OSS.\n" + f"The minimum required version is 4.55.0.\n" + f'Try `pip install --upgrade "transformers>=4.55.0"`\n' + f"to obtain the latest compatible transformers build, then " + f"restart this session." + ) + dispatch_model = FastGptOssModel elif model_type == "gemma2": if not SUPPORTS_GEMMA2: raise ImportError( From 8ff5caee7d75264d511ddeaffd8ba0752f1248e4 Mon Sep 17 00:00:00 2001 From: "C. Rhys Campbell" <149001340+crhysc@users.noreply.github.com> Date: Wed, 19 Nov 2025 15:03:22 -0500 Subject: [PATCH 15/50] patch if load_in_4bit --- atomgpt/inverse_models/loader.py | 25 +++++++++++++++++++------ 1 file changed, 19 insertions(+), 6 deletions(-) diff --git a/atomgpt/inverse_models/loader.py b/atomgpt/inverse_models/loader.py index e473adc..4f28327 100644 --- a/atomgpt/inverse_models/loader.py +++ b/atomgpt/inverse_models/loader.py @@ -918,14 +918,27 @@ def from_pretrained( ] ) pass - + if load_in_4bit: - # Fix up bitsandbytes config + # Fix up bitsandbytes config, robust to missing torch_dtype/dtype. 
+ cfg_dict = model.config.to_dict() + + compute_dtype = cfg_dict.get("torch_dtype", None) + if compute_dtype is None: + # Newer configs may use "dtype" instead + compute_dtype = cfg_dict.get("dtype", None) + + # Fall back to the user-specified dtype or a sensible default + if compute_dtype is None: + compute_dtype = _get_dtype(dtype) # imported above + + # Sometimes this is a string like "float16" – map to torch dtype + import torch + if isinstance(compute_dtype, str): + compute_dtype = getattr(torch, compute_dtype, torch.float16) + quantization_config = { - # Sometimes torch_dtype is not a string!! - "bnb_4bit_compute_dtype": model.config.to_dict()[ - "torch_dtype" - ], + "bnb_4bit_compute_dtype": compute_dtype, "bnb_4bit_quant_type": "nf4", "bnb_4bit_use_double_quant": True, "llm_int8_enable_fp32_cpu_offload": False, From 1cb88ef0acdae0985c42b3d69c18c062c5d5c1c0 Mon Sep 17 00:00:00 2001 From: ccamp104 Date: Wed, 19 Nov 2025 16:42:46 -0500 Subject: [PATCH 16/50] upgrade the required transformers version to 4.57.1 --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 33985ee..7249477 100644 --- a/requirements.txt +++ b/requirements.txt @@ -104,7 +104,7 @@ toolz==1.0.0 torch==2.7.0 torchvision==0.22.0 tqdm==4.67.1 -transformers==4.51.3 +transformers==4.57.1 triton==3.3.0 trl==0.15.2 typeguard==4.4.2 From 69226ee0c0be20e1898cdfb93d0195f70beef6e5 Mon Sep 17 00:00:00 2001 From: "C. Rhys Campbell" <149001340+crhysc@users.noreply.github.com> Date: Wed, 19 Nov 2025 17:14:22 -0500 Subject: [PATCH 17/50] add _get_dtype(dtype) to line 466 --- atomgpt/inverse_models/loader.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/atomgpt/inverse_models/loader.py b/atomgpt/inverse_models/loader.py index 4f28327..0a501ee 100644 --- a/atomgpt/inverse_models/loader.py +++ b/atomgpt/inverse_models/loader.py @@ -461,12 +461,12 @@ def from_pretrained( pass if load_in_4bit: - # Fix up bitsandbytes config + # Fix up bitsandbytes config, robust to missing torch_dtype. + # Use the same helper we use elsewhere. + compute_dtype = _get_dtype(dtype) # falls back to bf16/fp16 based on hardware + quantization_config = { - # Sometimes torch_dtype is not a string!! - "bnb_4bit_compute_dtype": model.config.to_dict()[ - "torch_dtype" - ], + "bnb_4bit_compute_dtype": compute_dtype, "bnb_4bit_quant_type": "nf4", "bnb_4bit_use_double_quant": True, "llm_int8_enable_fp32_cpu_offload": False, @@ -478,6 +478,7 @@ def from_pretrained( "quant_method": "bitsandbytes", } model.config.update({"quantization_config": quantization_config}) + pass if is_peft: From f6efda5cde7a0379c21c41381412bd57cf54c7b6 Mon Sep 17 00:00:00 2001 From: "C. 
Rhys Campbell" <149001340+crhysc@users.noreply.github.com> Date: Wed, 19 Nov 2025 17:36:55 -0500 Subject: [PATCH 18/50] add gpt_oss to def patch_peft_model() --- atomgpt/inverse_models/llama.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/atomgpt/inverse_models/llama.py b/atomgpt/inverse_models/llama.py index f647a75..053f15e 100644 --- a/atomgpt/inverse_models/llama.py +++ b/atomgpt/inverse_models/llama.py @@ -3141,6 +3141,13 @@ def patch_peft_model( apply_lora_mlp = apply_lora_mlp_swiglu elif model_type == "qwen3moe": apply_lora_mlp = apply_lora_mlp_swiglu + elif model_type == "gpt_oss": + if use_gradient_checkpointing == "unsloth": + try: + model.gradient_checkpointing_enable() + except Exception: + pass + return model else: raise NotImplementedError( f"AtomGPT: {model_type} is not yet implemented!" From f1bab0ef9238bd60f8b854abf69d9e392544a567 Mon Sep 17 00:00:00 2001 From: "C. Rhys Campbell" <149001340+crhysc@users.noreply.github.com> Date: Wed, 19 Nov 2025 17:58:53 -0500 Subject: [PATCH 19/50] patch num_logits_to_keep --- atomgpt/inverse_models/llama.py | 27 ++++++++++++++++++++++----- 1 file changed, 22 insertions(+), 5 deletions(-) diff --git a/atomgpt/inverse_models/llama.py b/atomgpt/inverse_models/llama.py index 053f15e..6f93e28 100644 --- a/atomgpt/inverse_models/llama.py +++ b/atomgpt/inverse_models/llama.py @@ -2012,11 +2012,28 @@ def unsloth_fast_generate( # For newer HF kwargs["cache_implementation"] = "dynamic" - # For num_logits_to_keep - num_logits_to_keep = kwargs.get("num_logits_to_keep", None) - logits_to_keep = kwargs.get("logits_to_keep", None) - if num_logits_to_keep is None and logits_to_keep is None: - kwargs["num_logits_to_keep"] = 1 + + # For num_logits_to_keep: only use it if the model forward actually supports it + import inspect + + try: + forward_sig = inspect.signature(self.forward) + supports_num_logits = ( + "num_logits_to_keep" in forward_sig.parameters + or "logits_to_keep" in forward_sig.parameters + ) + except (TypeError, ValueError): + supports_num_logits = False + + if supports_num_logits: + num_logits_to_keep = kwargs.get("num_logits_to_keep", None) + logits_to_keep = kwargs.get("logits_to_keep", None) + if num_logits_to_keep is None and logits_to_keep is None: + kwargs["num_logits_to_keep"] = 1 + else: + # Make sure we don't pass these through to HF generate for models that don't support them + kwargs.pop("num_logits_to_keep", None) + kwargs.pop("logits_to_keep", None) # Remove token_type_ids kwargs.pop("token_type_ids", None) From 7d3d37b2ab39820553f1809883e7bc37fcae6fff Mon Sep 17 00:00:00 2001 From: "C. 
Rhys Campbell" <149001340+crhysc@users.noreply.github.com> Date: Thu, 20 Nov 2025 18:01:50 -0500 Subject: [PATCH 20/50] strip num_logits_to_keep in unsloth_fast_generate() for gpt-oss models --- atomgpt/inverse_models/llama.py | 44 +++++++++++++++++++++------------ 1 file changed, 28 insertions(+), 16 deletions(-) diff --git a/atomgpt/inverse_models/llama.py b/atomgpt/inverse_models/llama.py index 6f93e28..6f29d0a 100644 --- a/atomgpt/inverse_models/llama.py +++ b/atomgpt/inverse_models/llama.py @@ -2013,27 +2013,39 @@ def unsloth_fast_generate( # For newer HF kwargs["cache_implementation"] = "dynamic" - # For num_logits_to_keep: only use it if the model forward actually supports it + # --- Handle num_logits_to_keep / logits_to_keep safely per model type --- import inspect - try: - forward_sig = inspect.signature(self.forward) - supports_num_logits = ( - "num_logits_to_keep" in forward_sig.parameters - or "logits_to_keep" in forward_sig.parameters - ) - except (TypeError, ValueError): - supports_num_logits = False + model_type = getattr(getattr(self, "config", None), "model_type", None) - if supports_num_logits: - num_logits_to_keep = kwargs.get("num_logits_to_keep", None) - logits_to_keep = kwargs.get("logits_to_keep", None) - if num_logits_to_keep is None and logits_to_keep is None: - kwargs["num_logits_to_keep"] = 1 - else: - # Make sure we don't pass these through to HF generate for models that don't support them + # GPT-OSS does *not* advertise these kwargs in its generate/forward stack, and + # passing them causes HF `generate` → `_validate_model_kwargs` to raise: + # ValueError: The following `model_kwargs` are not used by the model: ['num_logits_to_keep'] + # So for GPT-OSS we always strip them. + if model_type == "gpt_oss": kwargs.pop("num_logits_to_keep", None) kwargs.pop("logits_to_keep", None) + else: + # For other models (llama, mistral, etc.), keep Unsloth's optimization: + # only use num_logits_to_keep/logits_to_keep if the model forward supports them. + try: + forward_sig = inspect.signature(self.forward) + supports_num_logits = ( + "num_logits_to_keep" in forward_sig.parameters + or "logits_to_keep" in forward_sig.parameters + ) + except (TypeError, ValueError): + supports_num_logits = False + + if supports_num_logits: + num_logits_to_keep = kwargs.get("num_logits_to_keep", None) + logits_to_keep = kwargs.get("logits_to_keep", None) + if num_logits_to_keep is None and logits_to_keep is None: + # Enable Unsloth's memory optimization for compatible models + kwargs["num_logits_to_keep"] = 1 + else: + kwargs.pop("num_logits_to_keep", None) + kwargs.pop("logits_to_keep", None) # Remove token_type_ids kwargs.pop("token_type_ids", None) From c02d6a5f66d527e7b5dc7c1edab050a31a76ecdb Mon Sep 17 00:00:00 2001 From: "C. Rhys Campbell" <149001340+crhysc@users.noreply.github.com> Date: Thu, 20 Nov 2025 18:50:45 -0500 Subject: [PATCH 21/50] Update gpt_oss.py --- atomgpt/inverse_models/gpt_oss.py | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/atomgpt/inverse_models/gpt_oss.py b/atomgpt/inverse_models/gpt_oss.py index dc6a395..67e29a0 100644 --- a/atomgpt/inverse_models/gpt_oss.py +++ b/atomgpt/inverse_models/gpt_oss.py @@ -18,6 +18,32 @@ "and ensure you are on a release that supports GPT-OSS." ) from exc +# --- AtomGPT: fix GPT-OSS position_ids shape for rotary embeddings --- +# Some fast-generation paths may end up passing a 1D tensor for `position_ids` +# (shape [seq_len]), but GPT-OSS's rotary embeddings expect [batch, seq_len]. 
+# This wrapper upgrades 1D position_ids → [1, seq_len] to avoid IndexError. + +if not hasattr(GptOssModel, "_atomgpt_position_ids_patched"): + _original_gpt_oss_forward = GptOssModel.forward + + def _atomgpt_gpt_oss_forward(self, *args, **kwargs): + pos = kwargs.get("position_ids", None) + try: + if pos is not None and hasattr(pos, "dim") and pos.dim() == 1: + # [seq_len] -> [1, seq_len] + kwargs["position_ids"] = pos.unsqueeze(0) + except Exception: + # Best-effort: never let our fix be the thing that breaks. + pass + return _original_gpt_oss_forward(self, *args, **kwargs) + + GptOssModel.forward = _atomgpt_gpt_oss_forward + GptOssModel._atomgpt_position_ids_patched = True + + print( + "AtomGPT: Patched GptOssModel.forward to fix 1D position_ids for GPT-OSS rotary embeddings." + ) + # Convenience list of all 4 Unsloth GPT-OSS models that are supported via # FastLanguageModel.from_pretrained(..., model_name=...). From 078175d4347d2f60e61c248a734fef55db49ef2b Mon Sep 17 00:00:00 2001 From: "C. Rhys Campbell" <149001340+crhysc@users.noreply.github.com> Date: Thu, 20 Nov 2025 18:57:32 -0500 Subject: [PATCH 22/50] patch pre_patch() --- atomgpt/inverse_models/gpt_oss.py | 23 +++++++++++++++-------- 1 file changed, 15 insertions(+), 8 deletions(-) diff --git a/atomgpt/inverse_models/gpt_oss.py b/atomgpt/inverse_models/gpt_oss.py index 67e29a0..3355093 100644 --- a/atomgpt/inverse_models/gpt_oss.py +++ b/atomgpt/inverse_models/gpt_oss.py @@ -86,8 +86,9 @@ class FastGptOssModel(FastLlamaModel): * We **do**: - patch PEFT `PeftModelForCausalLM.forward` to the same fast path that LLaMA / Mistral use. - - patch `prepare_inputs_for_generation` for `GptOssForCausalLM`, so - that your generation path stays consistent with LLaMA / Mistral. + - (for now) leave `GptOssForCausalLM.prepare_inputs_for_generation` + untouched, because the LLaMA-style patch breaks GPT-OSS attention + shapes during sampling. * Everything else is delegated to `FastLlamaModel.from_pretrained` with `model_patcher=FastGptOssModel`, to keep the hierarchy uniform. """ @@ -106,17 +107,23 @@ def pre_patch(): global PeftModelForCausalLM # imported from llama.py via * PeftModelForCausalLM.forward = PeftModelForCausalLM_fast_forward - # Reuse the generation input patcher so GPT-OSS works with your - # custom `prepare_inputs_for_generation` handling (past-key-values, - # sliding window, etc.), analogous to Mistral / LLaMA. - fix_prepare_inputs_for_generation(GptOssForCausalLM) + # IMPORTANT: + # Do NOT call `fix_prepare_inputs_for_generation(GptOssForCausalLM)` + # here. That patch is tailored to LLaMA/Mistral KV-cache semantics and + # causes attention shape mismatches for GPT-OSS (e.g. value_states + # ending up with seq_len = 1 instead of the full context length). + # + # We'll rely on the official transformers implementation of + # `prepare_inputs_for_generation` for GPT-OSS instead. + # fix_prepare_inputs_for_generation(GptOssForCausalLM) _log_once( - "AtomGPT: Patched GPT-OSS (GptOssForCausalLM + PEFT) for " - "FastLanguageModel integration." + "AtomGPT: Patched GPT-OSS (PEFT fast forward only; " + "using native prepare_inputs_for_generation)." ) return + @staticmethod def from_pretrained( model_name: str = "unsloth/gpt-oss-20b-unsloth-bnb-4bit", From 1be754cfda497192d42d0889f340d0c2a87cd5e3 Mon Sep 17 00:00:00 2001 From: "C. 
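Patches 21 and 22 both lean on guarded monkey-patching. The load-bearing details are the saved original, the sentinel attribute that makes re-import a no-op, and deferring to the original after normalizing inputs. A distilled sketch on a hypothetical Target class (not a transformers type):

class Target:
    def forward(self, x):
        return x

if not hasattr(Target, "_patched"):        # sentinel: repeated imports are no-ops
    _original_forward = Target.forward     # keep a handle to the unpatched method

    def _patched_forward(self, x):
        x = max(x, 0)                      # normalize the argument, like 1D -> 2D above
        return _original_forward(self, x)  # then defer to the original implementation

    Target.forward = _patched_forward
    Target._patched = True

assert Target().forward(-3) == 0

Without the sentinel, a second import would wrap the wrapper, and _original_forward would point at the first wrapper rather than the true original.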
Rhys Campbell" <149001340+crhysc@users.noreply.github.com> Date: Thu, 20 Nov 2025 19:19:35 -0500 Subject: [PATCH 23/50] force progress bar --- atomgpt/inverse_models/inverse_models.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/atomgpt/inverse_models/inverse_models.py b/atomgpt/inverse_models/inverse_models.py index 9744286..753c303 100644 --- a/atomgpt/inverse_models/inverse_models.py +++ b/atomgpt/inverse_models/inverse_models.py @@ -593,7 +593,9 @@ def tokenize_function(example): num_train_epochs=config.num_epochs, save_strategy=config.save_strategy, save_steps=config.save_steps, - ), + disable_tqdm=False, + log_level="info", + ), ) if callback_samples > 0: callback = ExampleTrainerCallback( From 85d2f82768b12c3105c74876ec18f5e76f70c581 Mon Sep 17 00:00:00 2001 From: crhysc Date: Thu, 2 Oct 2025 16:24:41 -0400 Subject: [PATCH 24/50] add invalid structures error handling --- atomgpt/inverse_models/inverse_models.py | 117 +++++++++++++++++------ 1 file changed, 86 insertions(+), 31 deletions(-) diff --git a/atomgpt/inverse_models/inverse_models.py b/atomgpt/inverse_models/inverse_models.py index 753c303..b4cc788 100644 --- a/atomgpt/inverse_models/inverse_models.py +++ b/atomgpt/inverse_models/inverse_models.py @@ -27,6 +27,7 @@ from jarvis.io.vasp.inputs import Poscar import csv import os +import numpy as np from pydantic_settings import BaseSettings import sys import json @@ -211,42 +212,96 @@ def load_model(path="", config=None): FastLanguageModel.for_inference(model) return model, tokenizer, config +def _validate_atoms(atoms): + if atoms is None: + return False, "atoms_is_none" + try: + lat = np.asarray(getattr(atoms, "lattice_mat", None), dtype=float) + if lat.shape != (3, 3): + return False, f"bad_lattice_shape:{getattr(atoms,'lattice_mat',None)}" + if not np.isfinite(lat).all(): + return False, "nonfinite_lattice" + n = getattr(atoms, "num_atoms", None) + if n is None or n <= 0: + return False, f"num_atoms_invalid:{n}" + _ = Poscar(atoms).to_string() + return True, "" + except Exception as e: + return False, f"poscar_fail:{type(e).__name__}:{e}" + +def _poscar_one_line(at): + return Poscar(at).to_string().replace("\n", "\\n") + +def _misses_path(csv_out, config): + fname = getattr(config, "miss_csv", None) + if fname is None or not str(fname).strip(): + root, ext = os.path.splitext(csv_out) + fname = root + ".misses.csv" + os.makedirs(os.path.dirname(os.path.abspath(fname)), exist_ok=True) + return fname def evaluate( - test_set=[], model="", tokenizer="", csv_out="out.csv", config="" + test_set=[], + model="", + tokenizer="", + csv_out="out.csv", + config="", ): print("Testing\n", len(test_set)) - f = open(csv_out, "w") - f.write("id,target,prediction\n") + os.makedirs(os.path.dirname(os.path.abspath(csv_out)), exist_ok=True) + miss_csv_out = _misses_path(csv_out, config) + + with open(csv_out, "w", newline="") as f_ok, open(miss_csv_out, "w", newline="") as f_miss: + ok_writer = csv.writer(f_ok) + miss_writer = csv.writer(f_miss) + ok_writer.writerow(["id", "target", "prediction"]) + miss_writer.writerow(["id", "stage", "error", "detail", "raw_text_preview"]) + + for i in tqdm(test_set, total=len(test_set)): + sample_id = i.get("id", "") + target_mat = None + target_err = None + try: + target_mat = text2atoms("\n" + i["output"]) + ok, detail = _validate_atoms(target_mat) + if not ok: + target_err = detail + except Exception as e: + target_err = f"text2atoms:{type(e).__name__}:{e}" + + if target_err: + miss_writer.writerow([sample_id, "target", 
"invalid_target", target_err, (i.get("output","")[:240])]) + continue + + gen_mat = None + gen_err = None + try: + gen_mat = gen_atoms( + prompt=i["input"], + tokenizer=tokenizer, + model=model, + alpaca_prompt=config.alpaca_prompt, + instruction=config.instruction, + ) + ok, detail = _validate_atoms(gen_mat) + if not ok: + gen_err = detail + except Exception as e: + gen_err = f"gen_atoms:{type(e).__name__}:{e}" + + if gen_err: + miss_writer.writerow([sample_id, "prediction", "invalid_prediction", gen_err, ""]) + continue + + try: + ok_writer.writerow([ + sample_id, + _poscar_one_line(target_mat), + _poscar_one_line(gen_mat), + ]) + except Exception as e: + miss_writer.writerow([sample_id, "write", "write_failed", f"{type(e).__name__}:{e}", ""]) - for i in tqdm(test_set, total=len(test_set)): - # try: - # prompt = i["input"] - # print("prompt", prompt) - gen_mat = gen_atoms( - prompt=i["input"], - tokenizer=tokenizer, - model=model, - alpaca_prompt=config.alpaca_prompt, - instruction=config.instruction, - ) - target_mat = text2atoms("\n" + i["output"]) - print("target_mat", target_mat) - print("genmat", gen_mat) - line = ( - i["id"] - + "," - + Poscar(target_mat).to_string().replace("\n", "\\n") - + "," - + Poscar(gen_mat).to_string().replace("\n", "\\n") - + "\n" - ) - f.write(line) - # print() - # except Exception as exp: - # print("Error", exp) - # pass - f.close() def batch_evaluate( From 9552f4ea25f9276744c5676338547b94f5c46b5c Mon Sep 17 00:00:00 2001 From: "C. Rhys Campbell" <149001340+crhysc@users.noreply.github.com> Date: Tue, 18 Nov 2025 10:40:57 -0500 Subject: [PATCH 25/50] add error handling for inverse_predict.py --- atomgpt/inverse_models/inverse_predict.py | 111 ++++++++++++++-------- 1 file changed, 71 insertions(+), 40 deletions(-) diff --git a/atomgpt/inverse_models/inverse_predict.py b/atomgpt/inverse_models/inverse_predict.py index 378ca35..003cc08 100644 --- a/atomgpt/inverse_models/inverse_predict.py +++ b/atomgpt/inverse_models/inverse_predict.py @@ -111,12 +111,9 @@ def predict( prop_val=None, dtype=None, max_seq_length=1058, - load_in_4bit=None, # temp_config["load_in_4bit"] - verbose=True, # temp_config["load_in_4bit"] + load_in_4bit=None, + verbose=True, ): - # if not os.path.exists("config_name"): - - # config_name=os.path.join(output_dir,"config.json") print("config_path", config_path) if output_dir is not None: config_name = os.path.join(output_dir, "config.json") @@ -125,7 +122,7 @@ def predict( config_name = os.path.join(parent, "config.json") adapter = os.path.join(output_dir, "adapter_config.json") if os.path.exists(adapter): - model_name = output_dir # temp_config["model_name"] + model_name = output_dir if config_path is not None: config_name = config_path if verbose: @@ -142,7 +139,6 @@ def predict( pprint.pprint(temp_config) if model_name is None: model_name = temp_config["model_name"] - # output_dir = temp_config["output_dir"] if load_in_4bit is None: load_in_4bit = temp_config["load_in_4bit"] @@ -150,6 +146,7 @@ def predict( print("Model used:", model_name) print("config used:", config_path) print("formula:", formula) + model = None tokenizer = None try: @@ -161,29 +158,28 @@ def predict( device_map="auto", ) FastLanguageModel.for_inference(model) - except: + except Exception: tokenizer = AutoTokenizer.from_pretrained( model_name, gguf_file=filename ) model = AutoModelForCausalLM.from_pretrained( model_name, gguf_file=filename ) - pass + atoms_arr = [] lines = [] if formula is None: - # if dat_path is None: - f = open(pred_csv, "r") - lines = 
From 9552f4ea25f9276744c5676338547b94f5c46b5c Mon Sep 17 00:00:00 2001
From: "C. Rhys Campbell" <149001340+crhysc@users.noreply.github.com>
Date: Tue, 18 Nov 2025 10:40:57 -0500
Subject: [PATCH 25/50] add error handling for inverse_predict.py

---
 atomgpt/inverse_models/inverse_predict.py | 111 ++++++++++++++--------
 1 file changed, 71 insertions(+), 40 deletions(-)

diff --git a/atomgpt/inverse_models/inverse_predict.py b/atomgpt/inverse_models/inverse_predict.py
index 378ca35..003cc08 100644
--- a/atomgpt/inverse_models/inverse_predict.py
+++ b/atomgpt/inverse_models/inverse_predict.py
@@ -111,12 +111,9 @@ def predict(
     prop_val=None,
     dtype=None,
     max_seq_length=1058,
-    load_in_4bit=None,  # temp_config["load_in_4bit"]
-    verbose=True,  # temp_config["load_in_4bit"]
+    load_in_4bit=None,
+    verbose=True,
 ):
-    # if not os.path.exists("config_name"):
-
-    # config_name=os.path.join(output_dir,"config.json")
     print("config_path", config_path)
     if output_dir is not None:
         config_name = os.path.join(output_dir, "config.json")
@@ -125,7 +122,7 @@ def predict(
             config_name = os.path.join(parent, "config.json")
         adapter = os.path.join(output_dir, "adapter_config.json")
         if os.path.exists(adapter):
-            model_name = output_dir  # temp_config["model_name"]
+            model_name = output_dir
     if config_path is not None:
         config_name = config_path
     if verbose:
@@ -142,7 +139,6 @@ def predict(
         pprint.pprint(temp_config)
     if model_name is None:
         model_name = temp_config["model_name"]
-        # output_dir = temp_config["output_dir"]
     if load_in_4bit is None:
         load_in_4bit = temp_config["load_in_4bit"]
 
@@ -150,6 +146,7 @@ def predict(
     print("Model used:", model_name)
     print("config used:", config_path)
     print("formula:", formula)
+
     model = None
     tokenizer = None
     try:
@@ -161,29 +158,28 @@ def predict(
             device_map="auto",
         )
         FastLanguageModel.for_inference(model)
-    except:
+    except Exception:
         tokenizer = AutoTokenizer.from_pretrained(
             model_name, gguf_file=filename
         )
         model = AutoModelForCausalLM.from_pretrained(
             model_name, gguf_file=filename
         )
-        pass
+
     atoms_arr = []
     lines = []
     if formula is None:
-        # if dat_path is None:
-        f = open(pred_csv, "r")
-        lines = f.read().splitlines()
-        f.close()
+        with open(pred_csv, "r") as f:
+            lines = f.read().splitlines()
     else:
         if dat_path is not None:
             lines = [dat_path]
-        lines = [formula]
+        else:
+            lines = [formula]
 
     mem = []
-    for i in lines:
+    for idx, i in enumerate(lines):
         prompt = i
         if ".dat" in i or dat_path is not None:
             if dat_path is None:
@@ -198,21 +194,18 @@ def predict(
                 formula=formula,
                 background_subs=background_subs,
             )
-            # y[y < 0.1] = 0
-            y_new_str = y  # "\n".join(["{0:.2f}".format(x) for x in y])
+            y_new_str = y
             try:
                 if ".dat" in i:
                     formula = str(_formula.split("/")[-1].split(".dat")[0])
             except Exception:
                 pass
-            # gen_mat = main_spectra(spectra=[[y_new_str,y]],formulas=[formula],model=model,tokenizer=tokenizer,device='cuda')[0]
             prompt = (
                 "The chemical formula is "
                 + formula
                 + " The "
                 + temp_config["prop"]
                 + " is "
-                # + " The XRD is "
                 + y_new_str
                 + ". Generate atomic structure description with lattice lengths, angles, coordinates and atom types."
             )
@@ -224,34 +217,72 @@ def predict(
                 + " The "
                 + temp_config["prop"]
                 + " is "
-                # + " The XRD is "
                 + str(prop_val)
                 + ". Generate atomic structure description with lattice lengths, angles, coordinates and atom types."
             )
 
         if verbose:
-            print("prompt here", prompt.replace("\n", ","))
-        gen_mat = gen_atoms(
-            prompt=prompt,
-            model=model,
-            tokenizer=tokenizer,
-            alpaca_prompt=temp_config["alpaca_prompt"],
-            instruction=temp_config["instruction"],
-            device=device,
-        )
-        if verbose:
-            print("gen atoms", gen_mat)
-            print("gen atoms spacegroup", gen_mat.spacegroup())
-            print("intvl", intvl)
-        if relax:
-            gen_mat = relax_atoms(atoms=gen_mat)
+            print(f"[{idx}] prompt:", prompt.replace("\n", ","))
+
+        info = {"prompt": prompt}
+        gen_mat = None
+
+        # --- NEW: robust error handling around generation / structure use ---
+        try:
+            gen_mat = gen_atoms(
+                prompt=prompt,
+                model=model,
+                tokenizer=tokenizer,
+                alpaca_prompt=temp_config["alpaca_prompt"],
+                instruction=temp_config["instruction"],
+                device=device,
+            )
+
             if verbose:
-                print("gen atoms relax", gen_mat, gen_mat.spacegroup())
-        atoms_arr.append(gen_mat.to_dict())
-        info = {}
-        info["prompt"] = prompt
-        info["atoms"] = gen_mat.to_dict()
+                print(f"[{idx}] gen atoms:", gen_mat)
+                # spacegroup() can fail for broken structures, so guard it
+                try:
+                    print(f"[{idx}] gen atoms spacegroup:", gen_mat.spacegroup())
+                except Exception as e_sg:
+                    print(
+                        f"[WARN] Failed to compute spacegroup for sample {idx}: {e_sg}"
+                    )
+
+            if relax:
+                try:
+                    gen_mat = relax_atoms(atoms=gen_mat)
+                    if verbose:
+                        print(
+                            f"[{idx}] gen atoms relax:",
+                            gen_mat,
+                            gen_mat.spacegroup(),
+                        )
+                except Exception as e_relax:
+                    print(
+                        f"[WARN] Relaxation failed for sample {idx}, "
+                        "continuing with unrelaxed structure."
+                    )
+                    print(traceback.format_exc())
+
+            # this is another common crash point if gen_mat is invalid
+            atoms_dict = gen_mat.to_dict()
+            atoms_arr.append(atoms_dict)
+            info["atoms"] = atoms_dict
+
+        except Exception as e:
+            print(
+                f"[ERROR] Failed to generate a valid structure for sample {idx} "
+                f"(input: {i}): {e}"
+            )
+            # optional: print full traceback for debugging
+            print(traceback.format_exc())
+            info["error"] = str(e)
+            # do NOT re-raise; just skip this structure and move on
+            mem.append(info)
+            continue
+
         mem.append(info)
+
     dumpjson(data=mem, filename=fname)
     return model, tokenizer, temp_config

From 8902e7099deae5e17f480c2ad546052eac2c3eae Mon Sep 17 00:00:00 2001
From: Charles Campbell
Date: Wed, 5 Nov 2025 12:50:42 -0500
Subject: [PATCH 26/50] num_proc

---
 atomgpt/inverse_models/inverse_models.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/atomgpt/inverse_models/inverse_models.py b/atomgpt/inverse_models/inverse_models.py
index b4cc788..b9e337c 100644
--- a/atomgpt/inverse_models/inverse_models.py
+++ b/atomgpt/inverse_models/inverse_models.py
@@ -569,10 +569,12 @@ def tokenize_function(example):
     train_dataset = train_dataset.map(
         formatting_prompts_func_with_prompt,
         batched=True,
+        num_proc=config.dataset_num_proc
     )
     eval_dataset = eval_dataset.map(
         formatting_prompts_func_with_prompt,
         batched=True,
+        num_proc=config.dataset_num_proc
     )
     # Compute the actual max sequence length in raw text
     lengths = [
@@ -582,8 +584,8 @@ def tokenize_function(example):
     max_seq_length = max(lengths)
     print(f"🧠 Suggested max_seq_length based on dataset: {max_seq_length}")
 
-    tokenized_train = train_dataset.map(tokenize_function, batched=True)
-    tokenized_eval = eval_dataset.map(tokenize_function, batched=True)
+    tokenized_train = train_dataset.map(tokenize_function, batched=True, num_proc=config.dataset_num_proc)
+    tokenized_eval = eval_dataset.map(tokenize_function, batched=True, num_proc=config.dataset_num_proc)
     tokenized_train.set_format(
         type="torch", columns=["input_ids", "attention_mask", "output"]
     )

From 431e3f91509619c4154099244c7e85232b47c0d8 Mon Sep 17 00:00:00 2001
From: "C. Rhys Campbell" <149001340+crhysc@users.noreply.github.com>
Date: Tue, 18 Nov 2025 12:36:10 -0500
Subject: [PATCH 27/50] if gen_mat is None: ...
---
 atomgpt/inverse_models/inverse_predict.py | 108 ++++++++++------------
 1 file changed, 39 insertions(+), 69 deletions(-)

diff --git a/atomgpt/inverse_models/inverse_predict.py b/atomgpt/inverse_models/inverse_predict.py
index 003cc08..9e36a57 100644
--- a/atomgpt/inverse_models/inverse_predict.py
+++ b/atomgpt/inverse_models/inverse_predict.py
@@ -79,13 +79,10 @@ def relax_atoms(
         calculator = AlignnAtomwiseCalculator(path=default_path(), device="cpu")
 
     t1 = time.time()
-    # if calculator is None:
-    #    return atoms
     ase_atoms = atoms.ase_converter()
     ase_atoms.calc = calculator
     ase_atoms = ExpCellFilter(ase_atoms, constant_volume=constant_volume)
 
-    # TODO: Make it work with any other optimizer
     dyn = FIRE(ase_atoms)
     dyn.run(fmax=fmax, steps=nsteps)
     en = ase_atoms.atoms.get_potential_energy()
@@ -111,8 +108,8 @@ def predict(
     prop_val=None,
     dtype=None,
     max_seq_length=1058,
-    load_in_4bit=None,
-    verbose=True,
+    load_in_4bit=None,  # temp_config["load_in_4bit"]
+    verbose=True,  # temp_config["load_in_4bit"]
 ):
     print("config_path", config_path)
     if output_dir is not None:
@@ -139,6 +136,7 @@ def predict(
         pprint.pprint(temp_config)
     if model_name is None:
         model_name = temp_config["model_name"]
+        # output_dir = temp_config["output_dir"]
     if load_in_4bit is None:
         load_in_4bit = temp_config["load_in_4bit"]
 
@@ -146,7 +144,6 @@ def predict(
     print("Model used:", model_name)
     print("config used:", config_path)
     print("formula:", formula)
-
     model = None
     tokenizer = None
     try:
@@ -158,28 +155,29 @@ def predict(
             device_map="auto",
         )
         FastLanguageModel.for_inference(model)
-    except Exception:
+    except:
         tokenizer = AutoTokenizer.from_pretrained(
             model_name, gguf_file=filename
         )
         model = AutoModelForCausalLM.from_pretrained(
             model_name, gguf_file=filename
         )
-
+        pass
     atoms_arr = []
     lines = []
     if formula is None:
-        with open(pred_csv, "r") as f:
-            lines = f.read().splitlines()
+        # if dat_path is None:
+        f = open(pred_csv, "r")
+        lines = f.read().splitlines()
+        f.close()
     else:
         if dat_path is not None:
            lines = [dat_path]
-        else:
-            lines = [formula]
+        lines = [formula]
 
     mem = []
-    for idx, i in enumerate(lines):
+    for i in lines:
         prompt = i
         if ".dat" in i or dat_path is not None:
             if dat_path is None:
@@ -222,74 +220,47 @@ def predict(
             )
 
         if verbose:
-            print(f"[{idx}] prompt:", prompt.replace("\n", ","))
-
-        info = {"prompt": prompt}
-        gen_mat = None
+            print("prompt here", prompt.replace("\n", ","))
 
-        # --- NEW: robust error handling around generation / structure use ---
-        try:
-            gen_mat = gen_atoms(
-                prompt=prompt,
-                model=model,
-                tokenizer=tokenizer,
-                alpaca_prompt=temp_config["alpaca_prompt"],
-                instruction=temp_config["instruction"],
-                device=device,
-            )
-
-            if verbose:
-                print(f"[{idx}] gen atoms:", gen_mat)
-                # spacegroup() can fail for broken structures, so guard it
-                try:
-                    print(f"[{idx}] gen atoms spacegroup:", gen_mat.spacegroup())
-                except Exception as e_sg:
-                    print(
-                        f"[WARN] Failed to compute spacegroup for sample {idx}: {e_sg}"
-                    )
-
-            if relax:
-                try:
-                    gen_mat = relax_atoms(atoms=gen_mat)
-                    if verbose:
-                        print(
-                            f"[{idx}] gen atoms relax:",
-                            gen_mat,
-                            gen_mat.spacegroup(),
-                        )
-                except Exception as e_relax:
-                    print(
-                        f"[WARN] Relaxation failed for sample {idx}, "
-                        "continuing with unrelaxed structure."
-                    )
-                    print(traceback.format_exc())
-
-            # this is another common crash point if gen_mat is invalid
-            atoms_dict = gen_mat.to_dict()
-            atoms_arr.append(atoms_dict)
-            info["atoms"] = atoms_dict
+        gen_mat = gen_atoms(
+            prompt=prompt,
+            model=model,
+            tokenizer=tokenizer,
+            alpaca_prompt=temp_config["alpaca_prompt"],
+            instruction=temp_config["instruction"],
+            device=device,
+        )
 
-        except Exception as e:
+        if gen_mat is None:
             print(
-                f"[ERROR] Failed to generate a valid structure for sample {idx} "
-                f"(input: {i}): {e}"
+                "The returned structure is invalid. Here is the output:",
+                gen_mat,
             )
-            # optional: print full traceback for debugging
-            print(traceback.format_exc())
-            info["error"] = str(e)
-            # do NOT re-raise; just skip this structure and move on
+            info = {}
+            info["prompt"] = prompt
+            info["error"] = "Invalid structure returned by AtomGPT (None)."
             mem.append(info)
+            # skip the rest of the loop for this entry
             continue
 
+        if verbose:
+            print("gen atoms", gen_mat)
+            print("gen atoms spacegroup", gen_mat.spacegroup())
+            print("intvl", intvl)
+        if relax:
+            gen_mat = relax_atoms(atoms=gen_mat)
+            if verbose:
+                print("gen atoms relax", gen_mat, gen_mat.spacegroup())
+        atoms_arr.append(gen_mat.to_dict())
+        info = {}
+        info["prompt"] = prompt
+        info["atoms"] = gen_mat.to_dict()
         mem.append(info)
-
     dumpjson(data=mem, filename=fname)
     return model, tokenizer, temp_config
 
 
 if __name__ == "__main__":
-    # output_dir = make_id_prop()
-    # output_dir="."
     args = parser.parse_args(sys.argv[1:])
     print("args.config_path", args.config_path)
     predict(
@@ -302,5 +273,4 @@ if __name__ == "__main__":
         config_path=args.config_path,
         prop_val=args.prop_val,
         background_subs=args.background_subs,
-        # config_name=args.config_name,
     )

From 3bbd9d2293d13bdafb95831a2e4482a8c669de62 Mon Sep 17 00:00:00 2001
From: "C. Rhys Campbell" <149001340+crhysc@users.noreply.github.com>
Date: Tue, 18 Nov 2025 12:42:01 -0500
Subject: [PATCH 28/50] rm "Here is the output"

---
 atomgpt/inverse_models/inverse_predict.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/atomgpt/inverse_models/inverse_predict.py b/atomgpt/inverse_models/inverse_predict.py
index 9e36a57..109e3f9 100644
--- a/atomgpt/inverse_models/inverse_predict.py
+++ b/atomgpt/inverse_models/inverse_predict.py
@@ -233,8 +233,7 @@ def predict(
 
         if gen_mat is None:
             print(
-                "The returned structure is invalid. Here is the output:",
-                gen_mat,
+                "The structure returned by gen_mat() is not a valid crystal structure.
             )
             info = {}
             info["prompt"] = prompt

From e3d22e18eb0b773bf785cea0c86efdf3f96834ae Mon Sep 17 00:00:00 2001
From: "C. Rhys Campbell" <149001340+crhysc@users.noreply.github.com>
Date: Tue, 18 Nov 2025 12:43:10 -0500
Subject: [PATCH 29/50] terminate string literal

---
 atomgpt/inverse_models/inverse_predict.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/atomgpt/inverse_models/inverse_predict.py b/atomgpt/inverse_models/inverse_predict.py
index 109e3f9..63054cf 100644
--- a/atomgpt/inverse_models/inverse_predict.py
+++ b/atomgpt/inverse_models/inverse_predict.py
@@ -233,7 +233,7 @@ def predict(
 
         if gen_mat is None:
             print(
-                "The structure returned by gen_mat() is not a valid crystal structure.
+                "The structure returned by gen_mat() is not a valid crystal structure."
             )
             info = {}
             info["prompt"] = prompt

From d9c94afa78f476875af154734e8deef91998d041 Mon Sep 17 00:00:00 2001
From: ccamp104
Date: Mon, 1 Dec 2025 12:03:40 -0500
Subject: [PATCH 30/50] let tokenizers be >= 0.22.0

---
 requirements.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/requirements.txt b/requirements.txt
index 7249477..40ae609 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -98,7 +98,7 @@ SQLAlchemy==2.0.43
 sympy==1.14.0
 threadpoolctl==3.6.0
 tifffile==2025.5.10
-tokenizers==0.21.1
+tokenizers>=0.22.0
 tomli==2.2.1
 toolz==1.0.0
 torch==2.7.0

From 5b4ef60f4ff3359f2820b92946118db752491aaf Mon Sep 17 00:00:00 2001
From: ccamp104
Date: Mon, 1 Dec 2025 12:06:35 -0500
Subject: [PATCH 31/50] hf hub >= 0.32.0

---
 requirements.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/requirements.txt b/requirements.txt
index 40ae609..aa7cd60 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -28,7 +28,7 @@ gguf==0.16.3
 greenlet==3.2.4
 hf-xet==1.1.2
 hf_transfer==0.1.9
-huggingface-hub==0.32.0
+huggingface-hub>=0.32.0
 idna==3.10
 imageio==2.37.0
 importlib_metadata==8.7.0

From 32e831d5382ebc11d31c5d801a7ead42246ecfac Mon Sep 17 00:00:00 2001
From: ccamp104
Date: Mon, 1 Dec 2025 12:10:42 -0500
Subject: [PATCH 32/50] hf-xet>=1.1.2

---
 requirements.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/requirements.txt b/requirements.txt
index aa7cd60..61e4f7f 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -26,7 +26,7 @@ frozenlist==1.6.0
 fsspec==2025.3.0
 gguf==0.16.3
 greenlet==3.2.4
-hf-xet==1.1.2
+hf-xet>=1.1.2
 hf_transfer==0.1.9
 huggingface-hub>=0.32.0
 idna==3.10

From 006cf16be47170388d5aa45e1fcbec5889a1db7d Mon Sep 17 00:00:00 2001
From: ccamp104
Date: Thu, 4 Dec 2025 12:26:32 -0500
Subject: [PATCH 33/50] print target and predicted structures if
 PRINT_STRUCTURES=1

---
 atomgpt/inverse_models/inverse_models.py | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/atomgpt/inverse_models/inverse_models.py b/atomgpt/inverse_models/inverse_models.py
index b9e337c..a672d2f 100644
--- a/atomgpt/inverse_models/inverse_models.py
+++ b/atomgpt/inverse_models/inverse_models.py
@@ -293,6 +293,12 @@ def evaluate(
                 miss_writer.writerow([sample_id, "prediction", "invalid_prediction", gen_err, ""])
                 continue
 
+            if os.environ.get("PRINT_STRUCTURES"):
+                print("Target Structure:")
+                print(target_mat)
+                print("Predicted Structure:")
+                print(gen_mat)
+
             try:
                 ok_writer.writerow([

From 786ddacae193f07e87c7475caf5cc39525d50b30 Mon Sep 17 00:00:00 2001
From: ccamp104
Date: Thu, 4 Dec 2025 12:36:21 -0500
Subject: [PATCH 34/50] mv print statements before validation checks

---
 atomgpt/inverse_models/inverse_models.py | 18 ++++++++++++------
 1 file changed, 12 insertions(+), 6 deletions(-)

diff --git a/atomgpt/inverse_models/inverse_models.py b/atomgpt/inverse_models/inverse_models.py
index a672d2f..80f9e69 100644
--- a/atomgpt/inverse_models/inverse_models.py
+++ b/atomgpt/inverse_models/inverse_models.py
@@ -263,11 +263,17 @@ def evaluate(
             target_err = None
             try:
                 target_mat = text2atoms("\n" + i["output"])
+                if os.environ.get("PRINT_STRUCTURES"):
+                    print(f"Target Structure ({sample_id}):")
+                    print(target_mat)
+
                 ok, detail = _validate_atoms(target_mat)
                 if not ok:
                     target_err = detail
             except Exception as e:
                 target_err = f"text2atoms:{type(e).__name__}:{e}"
+                if os.environ.get("PRINT_STRUCTURES"):
+                    print(f"Target Structure ({sample_id}) FAILED: {target_err}")
 
             if target_err:
                 miss_writer.writerow([sample_id, "target", "invalid_target", target_err, (i.get("output","")[:240])])
@@ -283,22 +289,22 @@ def evaluate(
                     alpaca_prompt=config.alpaca_prompt,
                     instruction=config.instruction,
                 )
+                if os.environ.get("PRINT_STRUCTURES"):
+                    print(f"Predicted Structure ({sample_id}):")
+                    print(gen_mat)
+
                 ok, detail = _validate_atoms(gen_mat)
                 if not ok:
                     gen_err = detail
             except Exception as e:
                 gen_err = f"gen_atoms:{type(e).__name__}:{e}"
+                if os.environ.get("PRINT_STRUCTURES"):
+                    print(f"Predicted Structure ({sample_id}) FAILED: {gen_err}")
 
             if gen_err:
                 miss_writer.writerow([sample_id, "prediction", "invalid_prediction", gen_err, ""])
                 continue
 
-            if os.environ.get("PRINT_STRUCTURES"):
-                print("Target Structure:")
-                print(target_mat)
-                print("Predicted Structure:")
-                print(gen_mat)
-
             try:
                 ok_writer.writerow([

From 76602a662e062e1c4a996b39c9f1dc09dffcd6e9 Mon Sep 17 00:00:00 2001
From: ccamp104
Date: Thu, 4 Dec 2025 12:47:30 -0500
Subject: [PATCH 35/50] let the raw LLM output be printed if PRINT_STRUCTURES=1

---
 atomgpt/inverse_models/inverse_models.py | 5 ++++-
 atomgpt/inverse_models/utils.py          | 4 ++--
 2 files changed, 6 insertions(+), 3 deletions(-)

diff --git a/atomgpt/inverse_models/inverse_models.py b/atomgpt/inverse_models/inverse_models.py
index 80f9e69..a24e998 100644
--- a/atomgpt/inverse_models/inverse_models.py
+++ b/atomgpt/inverse_models/inverse_models.py
@@ -281,8 +281,9 @@ def evaluate(
 
             gen_mat = None
             gen_err = None
+            raw_response = ""
             try:
-                gen_mat = gen_atoms(
+                gen_mat, raw_response = gen_atoms(
                     prompt=i["input"],
                     tokenizer=tokenizer,
                     model=model,
@@ -300,6 +301,8 @@ def evaluate(
                 gen_err = f"gen_atoms:{type(e).__name__}:{e}"
                 if os.environ.get("PRINT_STRUCTURES"):
                     print(f"Predicted Structure ({sample_id}) FAILED: {gen_err}")
+                    print(f"Raw LLM Output ({sample_id}):")
+                    print(raw_response)
 
             if gen_err:
                 miss_writer.writerow([sample_id, "prediction", "invalid_prediction", gen_err, ""])
diff --git a/atomgpt/inverse_models/utils.py b/atomgpt/inverse_models/utils.py
index b5d441f..6395b24 100644
--- a/atomgpt/inverse_models/utils.py
+++ b/atomgpt/inverse_models/utils.py
@@ -175,7 +175,7 @@ def gen_atoms(
             print(exp)
             pass
 
-    return atoms
+    return atoms, response
 
 
 def get_crystal_string_t(atoms):
@@ -381,7 +381,7 @@ def main_spectra(
             + " Generate atomic structure description with lattice lengths, angles, coordinates and atom types."
         )
         # print(info)
-        atoms = gen_atoms(
+        atoms, _ = gen_atoms(
             prompt=info["input"],
             model=model,
             alpaca_prompt=alpaca_prompt,
From 33e52c87ab49b159a0adaa7b8a6badc306e517b5 Mon Sep 17 00:00:00 2001
From: crhysc
Date: Mon, 8 Dec 2025 20:45:01 -0500
Subject: [PATCH 36/50] initialize abs factory for loading. add chat template
 stubs

---
 atomgpt/inverse_models/factories.py      | 152 +++++++++++++++++++++++
 atomgpt/inverse_models/inverse_models.py |  76 ++----------
 atomgpt/inverse_models/products.py       |  16 +++
 3 files changed, 181 insertions(+), 63 deletions(-)
 create mode 100644 atomgpt/inverse_models/factories.py
 create mode 100644 atomgpt/inverse_models/products.py

diff --git a/atomgpt/inverse_models/factories.py b/atomgpt/inverse_models/factories.py
new file mode 100644
index 0000000..6e0d0ef
--- /dev/null
+++ b/atomgpt/inverse_models/factories.py
@@ -0,0 +1,152 @@
+# factories.py
+
+from abc import ABC, abstractmethod
+from .products import LoadedModel, ChatTemplate
+from .inverse_models import TrainingPropConfig
+from peft import PeftModel
+from .loader import FastLanguageModel as AtomGPTFastLanguageModel
+from unsloth import FastLanguageModel as UnslothFastLanguageModel
+from typing import Dict
+
+
+class LanguageModelFactory(ABC):
+    @abstractmethod
+    def load_for_training(self, config: TrainingPropConfig) -> LoadedModel:
+        pass
+
+    @abstractmethod
+    def load_for_inference(self, checkpoint_path: str, config: TrainingPropConfig) -> LoadedModel:
+        pass
+
+    @abstractmethod
+    def create_chat_template(self, config: TrainingPropConfig) -> ChatTemplate:
+        pass
+
+
+class AlpacaTemplate:
+    def format(self, instruction: str, user_input: str, output: str | None = None) -> str:
+        if output is None:
+            output = ""
+        return f"### Instruction:\n{instruction}\n### Input:\n{user_input}\n### Output:\n{output}"
+
+
+class HarmonyTemplate:
+    def format(self, instruction: str, user_input: str, output: str | None = None) -> str:
+        pass
+
+
+class AtomGPTFactory(LanguageModelFactory):
+    def load_for_training(self, config: TrainingPropConfig) -> LoadedModel:
+        model, tokenizer = AtomGPTFastLanguageModel.from_pretrained(
+            model_name=config.model_name,
+            max_seq_length=config.max_seq_length,
+            dtype=config.dtype,
+            load_in_4bit=config.load_in_4bit
+        )
+        if not isinstance(model, PeftModel):
+            # import sys
+            print("Not yet a peft model, converting into peft model")
+            # sys.exit()
+            model = FastLanguageModel.get_peft_model(
+                model,
+                r=config.lora_rank,  # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
+                target_modules=[
+                    "q_proj",
+                    "k_proj",
+                    "v_proj",
+                    "o_proj",
+                    "gate_proj",
+                    "up_proj",
+                    "down_proj",
+                ],
+                lora_alpha=config.lora_alpha,
+                lora_dropout=0,  # Supports any, but = 0 is optimized
+                bias="none",  # Supports any, but = "none" is optimized
+                use_gradient_checkpointing=True,
+                random_state=3407,
+                use_rslora=False,  # We support rank stabilized LoRA
+                loftq_config=None,  # And LoftQ
+            )
+            print("Peft model created")
+        EOS_TOKEN = tokenizer.eos_token
+        return LoadedModel(model=model, tokenizer=tokenizer)
+
+    def load_for_inference(self, checkpoint_path: str, config: TrainingPropConfig) -> LoadedModel:
+        model, tokenizer = FastLanguageModel.from_pretrained(
+            model_name=checkpoint_path,
+            max_seq_length=config.max_seq_length,
+            dtype=config.dtype,
+            load_in_4bit=config.load_in_4bit,
+        )
+        FastLanguageModel.for_inference(model)
+        return LoadedModel(model=model, tokenizer=tokenizer))
+
+    def create_chat_template(self, config: TrainingPropConfig) -> ChatTemplate:
+        return AlpacaTemplate()
+
+
+class GPTOSSFactory(LanguageModelFactory):
+    def load_for_training(self, config: TrainingPropConfig) -> LoadedModel:
+        model, tokenizer = UnslothFastLanguageModel.from_pretrained(
+            model_name=config.model_name,
+            max_seq_length=config.max_seq_length,
+            dtype=config.dtype,
+            load_in_4bit=config.load_in_4bit,
+            full_finetuning = False,
+        )
+        if not isinstance(model, PeftModel):
+            print("Not yet a peft model, converting into peft model")
+            model = FastLanguageModel.get_peft_model(
+                model,
+                r=config.lora_rank,  # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
+                target_modules=[
+                    "q_proj",
+                    "k_proj",
+                    "v_proj",
+                    "o_proj",
+                    "gate_proj",
+                    "up_proj",
+                    "down_proj",
+                ],
+                lora_alpha=config.lora_alpha,
+                lora_dropout=0,  # Supports any, but = 0 is optimized
+                bias="none",  # Supports any, but = "none" is optimized
+                use_gradient_checkpointing=unsloth,
+                random_state=3407,
+                use_rslora=False,  # We support rank stabilized LoRA
+                loftq_config=None,  # And LoftQ
+            )
+            print("Peft model created")
+        return LoadedModel(model=model, tokenizer=tokenizer)
+
+    def load_for_inference(self, checkpoint_path: str, config: TrainingPropConfig) -> LoadedModel:
+        model, tokenizer = UnslothFastLanguageModel.from_pretrained(
+            model_name=checkpoint_path,
+            max_seq_length=config.max_seq_length,
+            dtype=config.dtype,
+            load_in_4bit=config.load_in_4bit,
+        )
+        FastLanguageModel.for_inference(model)
+        return LoadedModel(model=model, tokenizer=tokenizer)
+
+    def create_chat_template(self, config: TrainingPropConfig) -> ChatTemplate:
+        return HarmonyTemplate()
+
+
+FACTORY_REGISTRY: Dict[str, type[LanguageModelFactory]] = {
+    "gemma": AtomGPTFactory,
+    "qwen": AtomGPTFactory,
+    "Meta": AtomGPTFactory,
+    "Llama": AtomGPTFactory,
+    "llama": AtomGPTFactory,
+    "Mistral": AtomGPTFactory,
+    "mistral": AtomGPTFactory,
+    "gpt-oss": GPTOssFactory,
+}
+
+def get_lm_factory(config: TrainingPropConfig) -> LanguageModelFactory:
+    model_name = config.model_name
+    factory_cls = FACTORY_REGISTRY.get(model_name.split("/", 1)[1].split("-", 1)[0])
+    if factory_cls is None:
+        raise ValueError(f"Unsupported model name: {model_name}. No model factory found.")
+    return factory_cls()
diff --git a/atomgpt/inverse_models/inverse_models.py b/atomgpt/inverse_models/inverse_models.py
index a24e998..08f2f29 100644
--- a/atomgpt/inverse_models/inverse_models.py
+++ b/atomgpt/inverse_models/inverse_models.py
@@ -1,5 +1,9 @@
 from typing import Optional
+from typing import Dict
+from typing import Literal
 from atomgpt.inverse_models.loader import FastLanguageModel
+from .factories import LanguageModelFactory, get_lm_factory
+from .products import LoadedModel, ChatTemplate
 
 # from unsloth import FastLanguageModel
 from atomgpt.inverse_models.callbacks import (
@@ -32,9 +36,9 @@
 import sys
 import json
 import argparse
-from typing import Literal
 import time
 from jarvis.core.composition import Composition
+import traceback
 
 # from atomgpt.inverse_models.custom_trainer import CustomSFTTrainer
 
@@ -301,6 +305,7 @@ def evaluate(
                 gen_err = f"gen_atoms:{type(e).__name__}:{e}"
                 if os.environ.get("PRINT_STRUCTURES"):
                     print(f"Predicted Structure ({sample_id}) FAILED: {gen_err}")
+                    print(traceback.format_exc())
                     print(f"Raw LLM Output ({sample_id}):")
                     print(raw_response)
 
@@ -521,38 +526,10 @@ def main(config_file=None):
         print(alpaca_prop_test_filename, "exists")
         m_test = loadjson(alpaca_prop_test_filename)
 
-    # 4bit pre quantized models we support for 4x faster downloading + no OOMs.
-    model, tokenizer = FastLanguageModel.from_pretrained(
-        model_name=config.model_name,  # Choose ANY! eg teknium/OpenHermes-2.5-Mistral-7B
-        max_seq_length=config.max_seq_length,
-        dtype=config.dtype,
-        load_in_4bit=config.load_in_4bit,
-        # token = "hf_...", # use one if using gated models like meta-llama/Llama-2-7b-hf
-    )
-    if not isinstance(model, PeftModel):
-        # import sys
-        print("Not Peft model")
-        # sys.exit()
-        model = FastLanguageModel.get_peft_model(
-            model,
-            r=config.lora_rank,  # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
-            target_modules=[
-                "q_proj",
-                "k_proj",
-                "v_proj",
-                "o_proj",
-                "gate_proj",
-                "up_proj",
-                "down_proj",
-            ],
-            lora_alpha=config.lora_alpha,
-            lora_dropout=0,  # Supports any, but = 0 is optimized
-            bias="none",  # Supports any, but = "none" is optimized
-            use_gradient_checkpointing=True,
-            random_state=3407,
-            use_rslora=False,  # We support rank stabilized LoRA
-            loftq_config=None,  # And LoftQ
-        )
+    factory = get_lm_factory(config)
+    loaded: LoadedModel = factory.load_for_training(config)
+    model, tokenizer = loaded.model, loaded.tokenizer
+    chat_template = factory.create_chat_template(config)
 
     EOS_TOKEN = tokenizer.eos_token  # Must add EOS_TOKEN
     # tokenizer.pad_token_id = tokenizer.eos_token_id
@@ -647,8 +624,6 @@ def tokenize_function(example):
     trainer = SFTTrainer(
         model=model,
         train_dataset=tokenized_train,
-        # train_dataset = train_dataset,
-        # tokenizer = tokenizer,
         args=SFTConfig(
             dataset_text_field="text",
             max_seq_length=config.max_seq_length,
@@ -672,44 +647,19 @@ def tokenize_function(example):
     if callback_samples > 0:
         callback = ExampleTrainerCallback(
             some_tokenized_dataset=tokenized_eval,
-            # some_tokenized_dataset=tokenized_eval,
             tokenizer=tokenizer,
             max_length=config.max_seq_length,
             callback_samples=callback_samples,
         )
         trainer.add_callback(callback)
+
     gpu_usage = PrintGPUUsageCallback()
     trainer.add_callback(gpu_usage)
-    trainer_stats = trainer.train()
+    trainer_stats = trainer.train(resume_from_checkpoint=True)
     trainer.save_model(config.model_save_path)
-    # model.save_pretrained(config.model_save_path)
-    # model, tokenizer = FastLanguageModel.from_pretrained(
-    #    model_name=config.model_save_path,  # YOUR MODEL YOU USED FOR TRAINING
-    #    max_seq_length=config.max_seq_length,
-    #    dtype=config.dtype,
-    #    load_in_4bit=config.load_in_4bit,
-    # )
 
     model = trainer.model
-    FastLanguageModel.for_inference(model)  # Enable native 2x faster inference
-    # model, tokenizer, config = load_model(path=config.model_save_path)
-    # batch_evaluate(
-    #    prompts=[i["input"] for i in m_test],
-    #    model=model,
-    #    tokenizer=tokenizer,
-    #    csv_out=config.csv_out,
-    #    config=config,
-    # )
-    # t1 = time.time()
-    # batch_evaluate(
-    #    test_set=m_test,
-    #    model=model,
-    #    tokenizer=tokenizer,
-    #    csv_out=config.csv_out,
-    #    config=config,
-    # )
-    # t2 = time.time()
-    # t1a = time.time()
+    FastLanguageModel.for_inference(model)
     evaluate(
         test_set=m_test,
         model=model,
diff --git a/atomgpt/inverse_models/products.py b/atomgpt/inverse_models/products.py
new file mode 100644
index 0000000..2f12923
--- /dev/null
+++ b/atomgpt/inverse_models/products.py
@@ -0,0 +1,16 @@
+# products.py
+
+from dataclasses import dataclass
+from typing import Protocol, Any
+import torch
+from transformers import PreTrainedTokenizerBase
+
+@dataclass
+class LoadedModel:
+    model: torch.nn.Module
+    tokenizer: PreTrainedTokenizerBase
+
+
+class ChatTemplate(Protocol):
+    def format() -> str:
+        pass
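As of patch 36, callers are meant to go through the factory seam rather than touching the loaders directly. A hedged usage sketch against the API exactly as introduced here (later patches rename create_chat_template); config is any object carrying the TrainingPropConfig fields, and the string arguments are placeholders:

from atomgpt.inverse_models.factories import get_lm_factory

def build_for_training(config):
    factory = get_lm_factory(config)            # dispatch on config.model_name
    loaded = factory.load_for_training(config)  # LoadedModel(model, tokenizer)
    template = factory.create_chat_template(config)
    # AlpacaTemplate.format(instruction, user_input, output=None)
    text = template.format("Generate a structure.", "The formula is SiO2 .")
    return loaded.model, loaded.tokenizer, text

Note the registry keys off model_name.split("/", 1)[1].split("-", 1)[0], so a model name with no "/" raises an IndexError before the ValueError can fire — one motivation for the simpler substring dispatch adopted in patch 38 below.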
From d0dd79354b1bfee51cc82f32f3542f93c8d00e67 Mon Sep 17 00:00:00 2001
From: crhysc
Date: Mon, 8 Dec 2025 20:46:24 -0500
Subject: [PATCH 37/50] add kwargs to format()

---
 atomgpt/inverse_models/products.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/atomgpt/inverse_models/products.py b/atomgpt/inverse_models/products.py
index 2f12923..08aae56 100644
--- a/atomgpt/inverse_models/products.py
+++ b/atomgpt/inverse_models/products.py
@@ -12,5 +12,5 @@ class LoadedModel:
 
 
 class ChatTemplate(Protocol):
-    def format() -> str:
+    def format(self, instruction: str, user_input: str, output: str | None = None) -> str:
         pass

From 1970a36fa401cf3d0abbfa3136edb2e49befb38e Mon Sep 17 00:00:00 2001
From: crhysc
Date: Sun, 14 Dec 2025 04:25:09 -0500
Subject: [PATCH 38/50] get harmony template in factory

---
 atomgpt/inverse_models/dataset_utils.py  | 65 +++++++++++++++++++-
 atomgpt/inverse_models/factories.py      | 51 +++++++---------
 atomgpt/inverse_models/inverse_models.py | 75 ++----------------
 atomgpt/inverse_models/products.py       |  4 +-
 4 files changed, 91 insertions(+), 104 deletions(-)

diff --git a/atomgpt/inverse_models/dataset_utils.py b/atomgpt/inverse_models/dataset_utils.py
index 8bf6b81..9e047ec 100644
--- a/atomgpt/inverse_models/dataset_utils.py
+++ b/atomgpt/inverse_models/dataset_utils.py
@@ -753,6 +753,67 @@ def _tokenize(example):
             )
             pass
         return dataset
-
-    pass
+
+def make_alpaca_json(
+    dataset=[],
+    jids=[],
+    # prop="Tc_supercon",
+    # instruction="",
+    include_jid=False,
+    # chem_info="",
+    # output_prompt="",
+    config=None,
+):
+    mem = []
+    print("config.prop", config.prop)
+    for i in dataset:
+        if i[config.prop] != "na" and i[config.id_tag] in jids:
+            atoms = Atoms.from_dict(i["atoms"])
+            info = {}
+            if include_jid:
+                info["id"] = i[config.id_tag]
+            info["instruction"] = config.instruction
+            if config.chem_info == "none":
+                chem = ""
+            elif config.chem_info == "element_list":
+                chem = atoms.composition.search_string
+            elif config.chem_info == "element_dict":
+                comp = Composition.from_string(
+                    atoms.composition.reduced_formula
+                )
+                chem = comp.to_dict()
+                chem = str(dict(sorted(chem.items())))
+            elif config.chem_info == "formula":
+                chem = atoms.composition.reduced_formula
+
+            inp = get_input(config=config, val=i[config.prop], chem=chem)
+            info["input"] = inp
+
+            info["output"] = get_crystal_string_t(atoms)
+            mem.append(info)
+    return mem
+
+def alpaca_formatting_prompts_func(examples: Dict[str, Any], alpaca_prompt: str, eos_token: str) -> Dict[str, List[str]]:
+    inst = examples["instruction"]
+    inp = examples["input"]
+    out = examples["output"]
+    texts = [alpaca_prompt.format(i, x, y) + eos_token for i, x, y in zip(inst, inp, out)]
+    return {"text": texts}
+
+def harmony_formatting_prompts_func(examples: Dict[str, Any], tokenizer) -> Dict[str, List[str]]:
+    inst = examples["instruction"]
+    inp = examples["input"]
+    out = examples["output"]
+    texts: List[str] = []
+    for i, x, y in zip(inst, inp, out):
+        messages = []
+        i = (i or "").strip()
+        x = (x or "").strip()
+        y = (y or "").strip()
+        if i:
+            messages.append({"role": "developer", "content": i})
+        messages.append({"role": "user", "content": x})
+        messages.append({"role": "assistant", "content": y})
+        texts.append(tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=False))
+    return {"text": texts}
diff --git a/atomgpt/inverse_models/factories.py b/atomgpt/inverse_models/factories.py
index 6e0d0ef..c9b5c2d 100644
--- a/atomgpt/inverse_models/factories.py
+++ b/atomgpt/inverse_models/factories.py
@@ -2,11 +2,16 @@
 
 from abc import ABC, abstractmethod
 from .products import LoadedModel, ChatTemplate
+from typing import Callable
 from .inverse_models import TrainingPropConfig
 from peft import PeftModel
 from .loader import FastLanguageModel as AtomGPTFastLanguageModel
 from unsloth import FastLanguageModel as UnslothFastLanguageModel
 from typing import Dict
+from .dataset_utils import alpaca_formatting_prompts_func
+from .dataset_utils import harmony_formatting_prompts_func
+from functools import partial
+from typing import List
 
 
 class LanguageModelFactory(ABC):
     @abstractmethod
@@ -19,19 +24,7 @@ def load_for_inference(self, checkpoint_path: str, config: TrainingPropConfig) -> LoadedModel:
         pass
 
     @abstractmethod
-    def create_chat_template(self, config: TrainingPropConfig) -> ChatTemplate:
-        pass
-
-
-class AlpacaTemplate:
-    def format(self, instruction: str, user_input: str, output: str | None = None) -> str:
-        if output is None:
-            output = ""
-        return f"### Instruction:\n{instruction}\n### Input:\n{user_input}\n### Output:\n{output}"
-
-
-class HarmonyTemplate:
-    def format(self, instruction: str, user_input: str, output: str | None = None) -> str:
+    def get_formatting_prompts_func(self, config, model, tokenizer) -> Callable:
         pass
 
@@ -40,7 +33,7 @@ class AtomGPTFactory(LanguageModelFactory):
     def load_for_training(self, config: TrainingPropConfig) -> LoadedModel:
         if not isinstance(model, PeftModel):
             # import sys
             print("Not yet a peft model, converting into peft model")
             # sys.exit()
-            model = FastLanguageModel.get_peft_model(
+            model = AtomGPTFastLanguageModel.get_peft_model(
                 model,
                 r=config.lora_rank,  # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
                 target_modules=[
@@ -78,11 +71,12 @@ def load_for_inference(self, checkpoint_path: str, config: TrainingPropConfig) -> LoadedModel:
             dtype=config.dtype,
             load_in_4bit=config.load_in_4bit,
         )
-        FastLanguageModel.for_inference(model)
-        return LoadedModel(model=model, tokenizer=tokenizer))
+        AtomGPTFastLanguageModel.for_inference(model)
+        return LoadedModel(model=model, tokenizer=tokenizer)
 
-    def create_chat_template(self, config: TrainingPropConfig) -> ChatTemplate:
-        return AlpacaTemplate()
+    def get_formatting_prompts_func(self, config, model, tokenizer) -> Callable:
+        eos = tokenizer.eos_token or ""
+        return partial(alpaca_formatting_prompts_func, alpaca_prompt=config.alpaca_prompt, eos_token=eos)
 
 
 class GPTOSSFactory(LanguageModelFactory):
@@ -96,7 +90,7 @@ def load_for_training(self, config: TrainingPropConfig) -> LoadedModel:
         )
         if not isinstance(model, PeftModel):
             print("Not yet a peft model, converting into peft model")
-            model = FastLanguageModel.get_peft_model(
+            model = UnslothFastLanguageModel.get_peft_model(
                 model,
                 r=config.lora_rank,  # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
                 target_modules=[
@@ -111,7 +105,7 @@ def load_for_training(self, config: TrainingPropConfig) -> LoadedModel:
                 lora_alpha=config.lora_alpha,
                 lora_dropout=0,  # Supports any, but = 0 is optimized
                 bias="none",  # Supports any, but = "none" is optimized
-                use_gradient_checkpointing=unsloth,
+                use_gradient_checkpointing=True,
                 random_state=3407,
                 use_rslora=False,  # We support rank stabilized LoRA
                 loftq_config=None,  # And LoftQ
@@ -126,12 +120,11 @@ def load_for_inference(self, checkpoint_path: str, config: TrainingPropConfig) -> LoadedModel:
             dtype=config.dtype,
             load_in_4bit=config.load_in_4bit,
         )
-        FastLanguageModel.for_inference(model)
+        UnslothFastLanguageModel.for_inference(model)
         return LoadedModel(model=model, tokenizer=tokenizer)
 
-    def create_chat_template(self, config: TrainingPropConfig) -> ChatTemplate:
-        return HarmonyTemplate()
-
+    def get_formatting_prompts_func(self, config, model, tokenizer) -> Callable:
+        return partial(harmony_formatting_prompts_func, tokenizer=tokenizer)
 
 FACTORY_REGISTRY: Dict[str, type[LanguageModelFactory]] = {
     "gemma": AtomGPTFactory,
@@ -141,12 +134,12 @@ def get_formatting_prompts_func(self, config, model, tokenizer) -> Callable:
     "llama": AtomGPTFactory,
     "Mistral": AtomGPTFactory,
     "mistral": AtomGPTFactory,
-    "gpt-oss": GPTOssFactory,
+    "gpt-oss": GPTOSSFactory,
 }
 
 def get_lm_factory(config: TrainingPropConfig) -> LanguageModelFactory:
     model_name = config.model_name
-    factory_cls = FACTORY_REGISTRY.get(model_name.split("/", 1)[1].split("-", 1)[0])
-    if factory_cls is None:
-        raise ValueError(f"Unsupported model name: {model_name}. No model factory found.")
-    return factory_cls()
+    if "gpt-oss" in model_name:
+        return GPTOSSFactory()
+    else:
+        return AtomGPTFactory()
diff --git a/atomgpt/inverse_models/inverse_models.py b/atomgpt/inverse_models/inverse_models.py
index 08f2f29..169cb0c 100644
--- a/atomgpt/inverse_models/inverse_models.py
+++ b/atomgpt/inverse_models/inverse_models.py
@@ -145,62 +145,6 @@ def get_input(config=None, chem="", val=10):
     )
     return inp
 
-
-def make_alpaca_json(
-    dataset=[],
-    jids=[],
-    # prop="Tc_supercon",
-    # instruction="",
-    include_jid=False,
-    # chem_info="",
-    # output_prompt="",
-    config=None,
-):
-    mem = []
-    print("config.prop", config.prop)
-    for i in dataset:
-        if i[config.prop] != "na" and i[config.id_tag] in jids:
-            atoms = Atoms.from_dict(i["atoms"])
-            info = {}
-            if include_jid:
-                info["id"] = i[config.id_tag]
-            info["instruction"] = config.instruction
-            if config.chem_info == "none":
-                chem = ""
-            elif config.chem_info == "element_list":
-                chem = atoms.composition.search_string
-            elif config.chem_info == "element_dict":
-                comp = Composition.from_string(
-                    atoms.composition.reduced_formula
-                )
-                chem = comp.to_dict()
-                chem = str(dict(sorted(chem.items())))
-            elif config.chem_info == "formula":
-                chem = atoms.composition.reduced_formula
-
-            inp = get_input(config=config, val=i[config.prop], chem=chem)
-            info["input"] = inp
-
-            info["output"] = get_crystal_string_t(atoms)
-            mem.append(info)
-    return mem
-
-
-def formatting_prompts_func(examples, alpaca_prompt):
-    instructions = examples["instruction"]
-    inputs = examples["input"]
-    outputs = examples["output"]
-    texts = []
-    EOS_TOKEN = ""
-    for instruction, input, output in zip(instructions, inputs, outputs):
-        # Must add EOS_TOKEN, otherwise your generation will go on forever!
-        text = alpaca_prompt.format(instruction, input, output) + EOS_TOKEN
-        texts.append(text)
-    return {
-        "text": texts,
-    }
-
-
 def load_model(path="", config=None):
     if config is None:
         config_file = os.path.join(path, "config.json")
@@ -410,7 +354,6 @@ def batch_evaluate(
 
 def main(config_file=None):
     if config_file is None:
-
         args = parser.parse_args(sys.argv[1:])
         config_file = args.config_name
     if not torch.cuda.is_available():
@@ -437,17 +380,13 @@ def main(config_file=None):
     run_path = os.path.dirname(id_prop_path)
     num_train = config.num_train
     num_test = config.num_test
-    # model_name = config.model_name
     callback_samples = config.callback_samples
-    # loss_function = config.loss_function
-    # id_prop_path = os.path.join(run_path, id_prop_path)
     with open(id_prop_path, "r") as f:
         reader = csv.reader(f)
         dt = [row for row in reader]
     if not num_train:
         num_test = int(len(dt) * config.test_ratio)
         num_train = len(dt) - num_test
-
     dat = []
     ids = []
     for i in tqdm(dt, total=len(dt)):
@@ -486,7 +425,7 @@ def main(config_file=None):
     print("num_train", num_train)
     print("num_test", num_test)
     test_ids = ids[num_train : num_train + num_test]
-    # test_ids = ids[num_train:]
+
     alpaca_prop_train_filename = os.path.join(
         config.output_dir, "alpaca_prop_train.json"
     )
@@ -529,11 +468,8 @@ def main(config_file=None):
     factory = get_lm_factory(config)
     loaded: LoadedModel = factory.load_for_training(config)
     model, tokenizer = loaded.model, loaded.tokenizer
-    chat_template = factory.create_chat_template(config)
+    formatting_prompts_func = factory.get_formatting_prompts_func(config, model, tokenizer)
 
-    EOS_TOKEN = tokenizer.eos_token  # Must add EOS_TOKEN
-    # tokenizer.pad_token_id = tokenizer.eos_token_id
-    # model.resize_token_embeddings(len(tokenizer))
     train_dataset = load_dataset(
         "json",
         data_files=alpaca_prop_train_filename,
@@ -546,9 +482,6 @@ def main(config_file=None):
         split="train",
         # "json", data_files="alpaca_prop_train.json", split="train"
     )
-    formatting_prompts_func_with_prompt = partial(
-        formatting_prompts_func, alpaca_prompt=config.alpaca_prompt
-    )
 
     def tokenize_function(example):
         return tokenizer(
@@ -559,12 +492,12 @@ def tokenize_function(example):
         )
 
     train_dataset = train_dataset.map(
-        formatting_prompts_func_with_prompt,
+        formatting_prompts_func,
         batched=True,
         num_proc=config.dataset_num_proc
     )
     eval_dataset = eval_dataset.map(
-        formatting_prompts_func_with_prompt,
+        formatting_prompts_func,
         batched=True,
         num_proc=config.dataset_num_proc
     )
diff --git a/atomgpt/inverse_models/products.py b/atomgpt/inverse_models/products.py
index 08aae56..3de3171 100644
--- a/atomgpt/inverse_models/products.py
+++ b/atomgpt/inverse_models/products.py
@@ -11,6 +11,6 @@ class LoadedModel:
     tokenizer: PreTrainedTokenizerBase
 
 
-class ChatTemplate(Protocol):
-    def format(self, instruction: str, user_input: str, output: str | None = None) -> str:
+class DatasetFormattingFunction(Protocol):
+    def get_formatting_prompts_func() -> function:
         pass

From 11b2ab6eefee862285d46d7315c5bd4f0aa5a02c Mon Sep 17 00:00:00 2001
From: crhysc
Date: Sun, 14 Dec 2025 04:29:27 -0500
Subject: [PATCH 39/50] from typing import Any

---
 atomgpt/inverse_models/dataset_utils.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/atomgpt/inverse_models/dataset_utils.py b/atomgpt/inverse_models/dataset_utils.py
index 9e047ec..535df28 100644
--- a/atomgpt/inverse_models/dataset_utils.py
+++ b/atomgpt/inverse_models/dataset_utils.py
@@ -6,6 +6,7 @@
 
 from typing import Union, Callable, Optional, List, Dict
 import torch
+from typing import Any
 
 # From https://www.geeksforgeeks.org/longest-common-substring-array-strings/
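For GPT-OSS, harmony_formatting_prompts_func delegates the actual token layout to the tokenizer's chat template rather than hand-rolling a prompt string. A sketch of the per-example message list it builds (field contents are illustrative; the commented call requires a loaded GPT-OSS tokenizer):

messages = [
    {"role": "developer", "content": "Below is a description of a superconductor."},
    {"role": "user", "content": "The chemical formula is MgB2 . The Tc_supercon is 39."},
    {"role": "assistant", "content": "3.08 3.08 3.52\n90 90 120\nMg 0 0 0 ..."},
]
# text = tokenizer.apply_chat_template(
#     messages, tokenize=False, add_generation_prompt=False
# )

Because the chat template injects the model's own special tokens, this path needs no manual EOS_TOKEN concatenation, unlike the Alpaca formatter.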
From 52228edf97fde0003f4b9c74ac179a9e8fea54a3 Mon Sep 17 00:00:00 2001
From: ccamp104
Date: Sun, 14 Dec 2025 04:33:08 -0500
Subject: [PATCH 40/50] remove relative import

---
 atomgpt/inverse_models/factories.py      | 10 +++++-----
 atomgpt/inverse_models/inverse_models.py |  4 ++--
 2 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/atomgpt/inverse_models/factories.py b/atomgpt/inverse_models/factories.py
index c9b5c2d..892aeca 100644
--- a/atomgpt/inverse_models/factories.py
+++ b/atomgpt/inverse_models/factories.py
@@ -1,15 +1,15 @@
 # factories.py
 
 from abc import ABC, abstractmethod
-from .products import LoadedModel, ChatTemplate
+from atomgpt.inverse_models.products import LoadedModel, ChatTemplate
 from typing import Callable
-from .inverse_models import TrainingPropConfig
+from atomgpt.inverse_models.inverse_models import TrainingPropConfig
 from peft import PeftModel
-from .loader import FastLanguageModel as AtomGPTFastLanguageModel
+from atomgpt.inverse_models.loader import FastLanguageModel as AtomGPTFastLanguageModel
 from unsloth import FastLanguageModel as UnslothFastLanguageModel
 from typing import Dict
-from .dataset_utils import alpaca_formatting_prompts_func
-from .dataset_utils import harmony_formatting_prompts_func
+from atomgpt.inverse_models.dataset_utils import alpaca_formatting_prompts_func
+from atomgpt.inverse_models.dataset_utils import harmony_formatting_prompts_func
 from functools import partial
 from typing import List
 
diff --git a/atomgpt/inverse_models/inverse_models.py b/atomgpt/inverse_models/inverse_models.py
index 169cb0c..9c3b8ca 100644
--- a/atomgpt/inverse_models/inverse_models.py
+++ b/atomgpt/inverse_models/inverse_models.py
@@ -2,8 +2,8 @@
 from typing import Dict
 from typing import Literal
 from atomgpt.inverse_models.loader import FastLanguageModel
-from .factories import LanguageModelFactory, get_lm_factory
-from .products import LoadedModel, ChatTemplate
+from atomgpt.inverse_models.factories import LanguageModelFactory, get_lm_factory
+from atomgpt.inverse_models.products import LoadedModel, ChatTemplate
 
 # from unsloth import FastLanguageModel
 from atomgpt.inverse_models.callbacks import (

From 1464ce13aec7b183feb249d2a506c875623a1551 Mon Sep 17 00:00:00 2001
From: ccamp104
Date: Sun, 14 Dec 2025 04:35:18 -0500
Subject: [PATCH 41/50] import callable

---
 atomgpt/inverse_models/products.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/atomgpt/inverse_models/products.py b/atomgpt/inverse_models/products.py
index 3de3171..6004f81 100644
--- a/atomgpt/inverse_models/products.py
+++ b/atomgpt/inverse_models/products.py
@@ -1,7 +1,7 @@
 # products.py
 
 from dataclasses import dataclass
-from typing import Protocol, Any
+from typing import Protocol, Any, Callable
 import torch
 from transformers import PreTrainedTokenizerBase
 
@@ -12,5 +12,5 @@ class LoadedModel:
 
 
 class DatasetFormattingFunction(Protocol):
-    def get_formatting_prompts_func() -> function:
+    def get_formatting_prompts_func() -> Callable:
         pass

From 9793ad838b5155e0b206fbaf6684ba6a28a4faa6 Mon Sep 17 00:00:00 2001
From: ccamp104
Date: Sun, 14 Dec 2025 04:36:25 -0500
Subject: [PATCH 42/50] remove import chattemplate

---
 atomgpt/inverse_models/factories.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/atomgpt/inverse_models/factories.py b/atomgpt/inverse_models/factories.py
index 892aeca..988bdfd 100644
--- a/atomgpt/inverse_models/factories.py
+++ b/atomgpt/inverse_models/factories.py
@@ -1,7 +1,7 @@
 # factories.py
 
 from abc import ABC, abstractmethod
-from atomgpt.inverse_models.products import LoadedModel, ChatTemplate
+from atomgpt.inverse_models.products import LoadedModel
 from typing import Callable
 from atomgpt.inverse_models.inverse_models import TrainingPropConfig
 from peft import PeftModel

From 3cfc59a71daa149a7c74c325a261e14d718b2720 Mon Sep 17 00:00:00 2001
From: ccamp104
Date: Sun, 14 Dec 2025 04:40:13 -0500
Subject: [PATCH 43/50] remove imports to non-interface objects for model
 loading and chat templates

---
 atomgpt/inverse_models/inverse_models.py | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/atomgpt/inverse_models/inverse_models.py b/atomgpt/inverse_models/inverse_models.py
index 9c3b8ca..8cbdc1e 100644
--- a/atomgpt/inverse_models/inverse_models.py
+++ b/atomgpt/inverse_models/inverse_models.py
@@ -1,11 +1,8 @@
 from typing import Optional
 from typing import Dict
 from typing import Literal
-from atomgpt.inverse_models.loader import FastLanguageModel
-from atomgpt.inverse_models.factories import LanguageModelFactory, get_lm_factory
-from atomgpt.inverse_models.products import LoadedModel, ChatTemplate
+from atomgpt.inverse_models.factories import get_lm_factory
 
-# from unsloth import FastLanguageModel
 from atomgpt.inverse_models.callbacks import (
     PrintGPUUsageCallback,
     ExampleTrainerCallback,

From 6e8771ae61beb0e5039977a7e826de0d015562f6 Mon Sep 17 00:00:00 2001
From: ccamp104
Date: Sun, 14 Dec 2025 04:46:50 -0500
Subject: [PATCH 44/50] add type checking if statement for the
 trainingpropconfig import

---
 atomgpt/inverse_models/factories.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/atomgpt/inverse_models/factories.py b/atomgpt/inverse_models/factories.py
index 988bdfd..a48a2c6 100644
--- a/atomgpt/inverse_models/factories.py
+++ b/atomgpt/inverse_models/factories.py
@@ -3,7 +3,9 @@
 from abc import ABC, abstractmethod
 from atomgpt.inverse_models.products import LoadedModel
 from typing import Callable
-from atomgpt.inverse_models.inverse_models import TrainingPropConfig
+from typing import TYPE_CHECKING
+if TYPE_CHECKING:
+    from atomgpt.inverse_models.inverse_models import TrainingPropConfig
 from peft import PeftModel
 from atomgpt.inverse_models.loader import FastLanguageModel as AtomGPTFastLanguageModel
 from unsloth import FastLanguageModel as UnslothFastLanguageModel

From 9a41780c2edeb7a5af461da480ea96a1d9fa74a9 Mon Sep 17 00:00:00 2001
From: ccamp104
Date: Sun, 14 Dec 2025 04:49:30 -0500
Subject: [PATCH 45/50] add unsloth>=2024.10,<2025.3

---
 requirements.txt | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/requirements.txt b/requirements.txt
index 61e4f7f..af7c05c 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -26,9 +26,9 @@ frozenlist==1.6.0
 fsspec==2025.3.0
 gguf==0.16.3
 greenlet==3.2.4
-hf-xet>=1.1.2
+hf-xet==1.1.2
 hf_transfer==0.1.9
-huggingface-hub>=0.32.0
+huggingface-hub==0.32.0
 idna==3.10
 imageio==2.37.0
 importlib_metadata==8.7.0
@@ -112,6 +112,7 @@ typing-inspection==0.4.1
 typing_extensions==4.13.2
 tyro==0.9.21
 tzdata==2025.2
+unsloth>=2024.10,<2025.3
 urllib3==2.4.0
 uv==0.7.8
 xformers==0.0.30
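The TYPE_CHECKING guard added in patch 44 (and the lazy in-function imports of patch 46 below) are the standard way to keep an annotation-only or heavyweight dependency from executing at import time, which is what breaks the factories.py / inverse_models.py cycle. In isolation (heavy_module and SomeConfig are hypothetical names):

from __future__ import annotations      # annotations stay strings at runtime
from typing import TYPE_CHECKING

if TYPE_CHECKING:                       # True for type checkers, False when running
    from heavy_module import SomeConfig # never imported by the interpreter

def consumer(config: SomeConfig) -> None:
    # The annotation is only metadata here; no heavy_module import is triggered.
    print(config)

consumer("any runtime value works")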
100644 --- a/atomgpt/inverse_models/factories.py +++ b/atomgpt/inverse_models/factories.py @@ -1,5 +1,7 @@ # factories.py +from __future__ import annotations + from abc import ABC, abstractmethod from atomgpt.inverse_models.products import LoadedModel from typing import Callable @@ -7,8 +9,6 @@ if TYPE_CHECKING: from atomgpt.inverse_models.inverse_models import TrainingPropConfig from peft import PeftModel -from atomgpt.inverse_models.loader import FastLanguageModel as AtomGPTFastLanguageModel -from unsloth import FastLanguageModel as UnslothFastLanguageModel from typing import Dict from atomgpt.inverse_models.dataset_utils import alpaca_formatting_prompts_func from atomgpt.inverse_models.dataset_utils import harmony_formatting_prompts_func @@ -32,6 +32,7 @@ def get_formatting_prompts_func(self, config, model, tokenizer) -> Callable: class AtomGPTFactory(LanguageModelFactory): def load_for_training(self, config: TrainingPropConfig) -> LoadedModel: + from atomgpt.inverse_models.loader import FastLanguageModel as AtomGPTFastLanguageModel model, tokenizer = AtomGPTFastLanguageModel.from_pretrained( model_name=config.model_name, max_seq_length=config.max_seq_length, @@ -83,6 +84,7 @@ def get_formatting_prompts_func(self, config, model, tokenizer) -> Callable: class GPTOSSFactory(LanguageModelFactory): def load_for_training(self, config: TrainingPropConfig) -> LoadedModel: + from unsloth import FastLanguageModel as UnslothFastLanguageModel model, tokenizer = UnslothFastLanguageModel.from_pretrained( model_name=config.model_name, max_seq_length=config.max_seq_length, From c8790f8f35bc55607f0585f3d9583412899fa3bd Mon Sep 17 00:00:00 2001 From: ccamp104 Date: Sun, 14 Dec 2025 05:15:43 -0500 Subject: [PATCH 47/50] from atomgpt.inverse_models.dataset_utils import make_alpaca_json --- atomgpt/inverse_models/inverse_models.py | 1 + 1 file changed, 1 insertion(+) diff --git a/atomgpt/inverse_models/inverse_models.py b/atomgpt/inverse_models/inverse_models.py index 8cbdc1e..c0b634d 100644 --- a/atomgpt/inverse_models/inverse_models.py +++ b/atomgpt/inverse_models/inverse_models.py @@ -36,6 +36,7 @@ import time from jarvis.core.composition import Composition import traceback +from atomgpt.inverse_models.dataset_utils import make_alpaca_json # from atomgpt.inverse_models.custom_trainer import CustomSFTTrainer From 780d334595fcc6b62d2707e691ae0acc082ba4bb Mon Sep 17 00:00:00 2001 From: ccamp104 Date: Sun, 14 Dec 2025 05:18:25 -0500 Subject: [PATCH 48/50] add imports for make_alpaca_json() --- atomgpt/inverse_models/dataset_utils.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/atomgpt/inverse_models/dataset_utils.py b/atomgpt/inverse_models/dataset_utils.py index 535df28..ef676b1 100644 --- a/atomgpt/inverse_models/dataset_utils.py +++ b/atomgpt/inverse_models/dataset_utils.py @@ -7,6 +7,15 @@ from typing import Union, Callable, Optional, List, Dict import torch from typing import Any +from jarvis.core.atoms import Atoms +from jarvis.io.vasp.inputs import Poscar +from jarvis.core.composition import Composition +from atomgpt.inverse_models.utils import ( + gen_atoms, + text2atoms, + get_crystal_string_t, + get_figlet, +) # From https://www.geeksforgeeks.org/longest-common-substring-array-strings/ From 58bac34bdd53ee80e37b473882e4b91fd5a68e18 Mon Sep 17 00:00:00 2001 From: ccamp104 Date: Sun, 14 Dec 2025 05:20:02 -0500 Subject: [PATCH 49/50] mv get_input() to dataset_utils --- atomgpt/inverse_models/dataset_utils.py | 33 ++++++++++++++++++++++++ atomgpt/inverse_models/inverse_models.py | 33 
------------------------ 2 files changed, 33 insertions(+), 33 deletions(-) diff --git a/atomgpt/inverse_models/dataset_utils.py b/atomgpt/inverse_models/dataset_utils.py index ef676b1..889260a 100644 --- a/atomgpt/inverse_models/dataset_utils.py +++ b/atomgpt/inverse_models/dataset_utils.py @@ -765,6 +765,39 @@ def _tokenize(example): return dataset pass +def get_input(config=None, chem="", val=10): + if config.chem_info == "none": + prefix = "" + elif config.chem_info == "element_list": + prefix = ( + "The chemical elements are " + + chem # atoms.composition.search_string + + " . " + ) + elif config.chem_info == "element_dict": + prefix = ( + "The chemical contents are " + + chem # atoms.composition.search_string + + " . " + ) + elif config.chem_info == "formula": + prefix = ( + "The chemical formula is " + + chem # atoms.composition.reduced_formula + + " . " + ) + + inp = ( + prefix + + "The " + + config.prop + + " is " + + str(val) + + "." + + config.output_prompt + ) + return inp + def make_alpaca_json( dataset=[], jids=[], diff --git a/atomgpt/inverse_models/inverse_models.py b/atomgpt/inverse_models/inverse_models.py index c0b634d..d7ee496 100644 --- a/atomgpt/inverse_models/inverse_models.py +++ b/atomgpt/inverse_models/inverse_models.py @@ -110,39 +110,6 @@ class TrainingPropConfig(BaseSettings): logging_steps: int = 10 -def get_input(config=None, chem="", val=10): - if config.chem_info == "none": - prefix = "" - elif config.chem_info == "element_list": - prefix = ( - "The chemical elements are " - + chem # atoms.composition.search_string - + " . " - ) - elif config.chem_info == "element_dict": - prefix = ( - "The chemical contents are " - + chem # atoms.composition.search_string - + " . " - ) - elif config.chem_info == "formula": - prefix = ( - "The chemical formula is " - + chem # atoms.composition.reduced_formula - + " . " - ) - - inp = ( - prefix - + "The " - + config.prop - + " is " - + str(val) - + "." - + config.output_prompt - ) - return inp - def load_model(path="", config=None): if config is None: config_file = os.path.join(path, "config.json") From 73e3096a9823b6cc90dbf103eb168d2f6e272ad9 Mon Sep 17 00:00:00 2001 From: ccamp104 Date: Sun, 14 Dec 2025 05:22:38 -0500 Subject: [PATCH 50/50] rm resume from chkpt=true --- atomgpt/inverse_models/inverse_models.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/atomgpt/inverse_models/inverse_models.py b/atomgpt/inverse_models/inverse_models.py index d7ee496..e343437 100644 --- a/atomgpt/inverse_models/inverse_models.py +++ b/atomgpt/inverse_models/inverse_models.py @@ -553,7 +553,7 @@ def tokenize_function(example): gpu_usage = PrintGPUUsageCallback() trainer.add_callback(gpu_usage) - trainer_stats = trainer.train(resume_from_checkpoint=True) + trainer_stats = trainer.train() trainer.save_model(config.model_save_path) model = trainer.model
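---

Notes on recurring patterns in this series (illustrative sketches, not project code):

Patch 41 fixes an annotation that fails at import time: "function" is not a defined name in Python 3, so def get_formatting_prompts_func() -> function: raises NameError the moment the class body executes, while Callable from typing is the supported spelling. A minimal sketch of the corrected Protocol with hypothetical class names; note that giving the protocol method a self parameter (which the patched code still omits) is what lets it match ordinary instance methods structurally:

from typing import Callable, Protocol


class FormattingProvider(Protocol):
    # Illustrative stand-in for DatasetFormattingFunction; self makes the
    # protocol match instance methods of implementing classes.
    def get_formatting_prompts_func(self) -> Callable:
        ...


class AlpacaProvider:
    def get_formatting_prompts_func(self) -> Callable:
        # Returns a closure, which is exactly what Callable advertises.
        def fmt(example: dict) -> str:
            return str(example)

        return fmt


provider: FormattingProvider = AlpacaProvider()  # passes structural checks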
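Patches 44 and 46 together apply the standard recipe for breaking a circular import: factories.py needs TrainingPropConfig from inverse_models.py only for type annotations, so the import moves under typing.TYPE_CHECKING (evaluated by type checkers, skipped at runtime) while from __future__ import annotations keeps annotations as lazily evaluated strings. A self-contained sketch of the pattern, with hypothetical module names:

# factories_sketch.py -- hypothetical module illustrating patches 44 and 46
from __future__ import annotations  # annotations stay unevaluated strings

from typing import TYPE_CHECKING

if TYPE_CHECKING:
    # Seen by mypy/pyright only; at runtime this block never runs, so the
    # import cycle back into the config module cannot trigger.
    from config_sketch import TrainingConfig


def load_for_training(config: TrainingConfig) -> None:
    # "TrainingConfig" is never looked up at runtime under the future import.
    print(config)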
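The other half of patch 46 moves the two FastLanguageModel imports from module scope into the factory methods that use them, so importing factories.py no longer imports unsloth eagerly, and a missing or version-mismatched install only fails the code path that actually needs it. A hedged sketch of the shape; the class name and the from_pretrained arguments mirror the diff, but the exact unsloth signature across releases is not verified here:

class GPTOSSFactorySketch:
    def load_for_training(self, config):
        # Deferred import: unsloth is resolved on first use, not at module
        # import, so an ImportError points at the backend actually selected.
        from unsloth import FastLanguageModel as UnslothFastLanguageModel

        model, tokenizer = UnslothFastLanguageModel.from_pretrained(
            model_name=config.model_name,
            max_seq_length=config.max_seq_length,
        )
        return model, tokenizer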
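get_input() as moved in patch 49 builds the inverse-design prompt, but its if/elif chain leaves prefix unbound whenever config.chem_info holds a value other than the four it tests, which later surfaces as a NameError rather than a clear message. A defensive variant, sketched with a dict of templates; the fallback behavior is an assumption, not current project behavior:

# Sketch: same prompt construction as get_input(), but failing loudly on an
# unexpected chem_info value instead of with an unrelated NameError.
PREFIX_TEMPLATES = {
    "none": "",
    "element_list": "The chemical elements are {chem} . ",
    "element_dict": "The chemical contents are {chem} . ",
    "formula": "The chemical formula is {chem} . ",
}


def build_prompt(config, chem="", val=10):
    try:
        prefix = PREFIX_TEMPLATES[config.chem_info].format(chem=chem)
    except KeyError:
        raise ValueError("Unsupported chem_info: %r" % config.chem_info)
    return (
        prefix + "The " + config.prop + " is " + str(val) + "." + config.output_prompt
    )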
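Patch 50 drops resume_from_checkpoint=True, which is the right default: with the transformers Trainer API, train(resume_from_checkpoint=True) raises a ValueError on any fresh run, because no checkpoint-* directory exists yet in the output directory. If opportunistic resumption is still wanted, the usual compromise is to resolve the checkpoint first; a sketch assuming the trainer follows the transformers API and that the output directory lives on the config (that attribute name is an assumption):

from transformers.trainer_utils import get_last_checkpoint

# Newest checkpoint-<step> directory, or None when the directory holds no
# checkpoints, so a fresh run starts from scratch instead of raising.
last_ckpt = get_last_checkpoint(config.output_dir)
trainer_stats = trainer.train(resume_from_checkpoint=last_ckpt)
trainer.save_model(config.model_save_path)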