From da41d09e2532337cbf0663dabe0ebcd265a13d47 Mon Sep 17 00:00:00 2001 From: Charles Campbell Date: Tue, 28 Oct 2025 18:34:09 -0400 Subject: [PATCH 01/50] initialize diffractgpt script --- .../ramangpt/dataset_atomgpt_spectra.py | 24 +++++++++++++++++++ 1 file changed, 24 insertions(+) create mode 100644 atomgpt/scripts/ramangpt/dataset_atomgpt_spectra.py diff --git a/atomgpt/scripts/ramangpt/dataset_atomgpt_spectra.py b/atomgpt/scripts/ramangpt/dataset_atomgpt_spectra.py new file mode 100644 index 0000000..9f984b1 --- /dev/null +++ b/atomgpt/scripts/ramangpt/dataset_atomgpt_spectra.py @@ -0,0 +1,24 @@ +import numpy as np +from atomgpt.inverse_models.utils import smooth_xrd +from jarvis.io.vasp.inputs import Poscar +from jarvis.db.figshare import data +from jarvis.core.atoms import Atoms +from tqdm import tqdm + +d = data("dft_3d") +# d = data('alex_pbe_hull') + +f = open("id_prop.csv", "w") +count = 0 +for i in tqdm(d, total=len(d)): + # if count<10: + atoms = Atoms.from_dict(i["atoms"]) + jid = i["jid"] + poscar_name = "POSCAR-" + jid + ".vasp" + atoms.write_poscar(poscar_name) + y_new_str, cccc = smooth_xrd(atoms=atoms, intvl=0.3, thetas=[0, 90]) + f.write("%s,%s\n" % (poscar_name, y_new_str)) + count += 1 + # if count == max_samples: + # break +f.close() From bea7d59858faea0fadc8110ea4028640363a6033 Mon Sep 17 00:00:00 2001 From: Charles Campbell Date: Tue, 28 Oct 2025 18:39:13 -0400 Subject: [PATCH 02/50] initialize correct dataset script --- .../ramangpt/dataset_atomgpt_spectra.py | 24 --- .../ramangpt/dataset_atomgpt_spectra2.py | 137 ++++++++++++++++++ 2 files changed, 137 insertions(+), 24 deletions(-) delete mode 100644 atomgpt/scripts/ramangpt/dataset_atomgpt_spectra.py create mode 100644 atomgpt/scripts/ramangpt/dataset_atomgpt_spectra2.py diff --git a/atomgpt/scripts/ramangpt/dataset_atomgpt_spectra.py b/atomgpt/scripts/ramangpt/dataset_atomgpt_spectra.py deleted file mode 100644 index 9f984b1..0000000 --- a/atomgpt/scripts/ramangpt/dataset_atomgpt_spectra.py +++ /dev/null @@ -1,24 +0,0 @@ -import numpy as np -from atomgpt.inverse_models.utils import smooth_xrd -from jarvis.io.vasp.inputs import Poscar -from jarvis.db.figshare import data -from jarvis.core.atoms import Atoms -from tqdm import tqdm - -d = data("dft_3d") -# d = data('alex_pbe_hull') - -f = open("id_prop.csv", "w") -count = 0 -for i in tqdm(d, total=len(d)): - # if count<10: - atoms = Atoms.from_dict(i["atoms"]) - jid = i["jid"] - poscar_name = "POSCAR-" + jid + ".vasp" - atoms.write_poscar(poscar_name) - y_new_str, cccc = smooth_xrd(atoms=atoms, intvl=0.3, thetas=[0, 90]) - f.write("%s,%s\n" % (poscar_name, y_new_str)) - count += 1 - # if count == max_samples: - # break -f.close() diff --git a/atomgpt/scripts/ramangpt/dataset_atomgpt_spectra2.py b/atomgpt/scripts/ramangpt/dataset_atomgpt_spectra2.py new file mode 100644 index 0000000..4751bf1 --- /dev/null +++ b/atomgpt/scripts/ramangpt/dataset_atomgpt_spectra2.py @@ -0,0 +1,137 @@ +# This is an updated version of diffractgpt using find_peaks algorithm +import numpy as np +from scipy.signal import find_peaks +from jarvis.analysis.diffraction.xrd import XRD +from jarvis.core.atoms import Atoms +from jarvis.db.figshare import data +from tqdm import tqdm +from jarvis.db.jsonutils import dumpjson +def get_crystal_string_t(atoms): + lengths = atoms.lattice.abc # structure.lattice.parameters[:3] + angles = atoms.lattice.angles + atom_ids = atoms.elements + frac_coords = atoms.frac_coords + + crystal_str = ( + " ".join(["{0:.2f}".format(x) for x in lengths]) + + "\n" + 
+ " ".join([str(int(x)) for x in angles]) + + "\n" + + "\n".join( + [ + str(t) + " " + " ".join(["{0:.3f}".format(x) for x in c]) + for t, c in zip(atom_ids, frac_coords) + ] + ) + ) + + # crystal_str = atoms_describer(atoms) + "\n*\n" + crystal_str + return crystal_str + + +def gaussian_recast(x_original=[], y_original=[], x_new=[], sigma=.1): + y_new = np.zeros_like(x_new, dtype=np.float64) + for x0, amp in zip(x_original, y_original): + y_new += amp * np.exp(-0.5 * ((x_new - x0) / sigma) ** 2) + x_new=np.array(x_new) + y_new=np.array(y_new) + return x_new, y_new + +def make_diffractgpt_prompt(atoms, jid='na',thetas=[0, 90], num_peaks=20): + """Reads 2θ–intensity data, extracts top N peaks, and builds a prompt for DiffractGPT.""" + two_theta, d, intensity = XRD(thetas=thetas).simulate(atoms=atoms) + intensity = np.array(intensity) + intensity /= intensity.max() + + two_theta = np.array(two_theta) + x_new = np.arange(0, 90, .1) + two_theta,intensity = gaussian_recast(x_original=two_theta,y_original=intensity,x_new=x_new) + #print("two_theta",two_theta) + #print("intensity",intensity) + intensity /= intensity.max() + #print(intensity,max(intensity),len(intensity)) + + # Find all peaks (with minimal height threshold to ignore noise) + peaks, props = find_peaks(intensity, height=0.01, distance=1,prominence=0.05) + #print("peaks",peaks) + # Get top N peaks by intensity + top_indices = np.argsort(props['peak_heights'])[::-1][:num_peaks] + top_peaks = peaks[top_indices] + top_peaks_sorted = top_peaks[np.argsort(two_theta[top_peaks])] + + # Create list of (2θ, intensity) + peak_list = [(round(two_theta[p], 2), round(intensity[p], 2)) for p in top_peaks_sorted] + + # Build DiffractGPT prompt + peak_text = ", ".join([f"{t}°({i})" for t, i in peak_list]) + print(jid,peak_text) + num_peaks = len(peaks) + formula=atoms.composition.reduced_formula + input = ( + f"The chemical formula is: {formula}.\n" + f"The XRD pattern shows main peaks at: {peak_text}.\n" + f"Generate atomic structure description with lattice lengths, angles, coordinates and atom types." + ) + output= get_crystal_string_t(atoms) + info={} + info["instruction"]= "Below is a description of a material." 
+ info["input"]=input + info['id']=jid + info['peak_text']=peak_text + info["output"]=output + return info + +# Example usage +if __name__ == "__main__": + #atoms = Atoms.from_poscar('POSCAR') + jids=["JVASP-32","JVASP-15014","JVASP-1002","JVASP-107","JVASP-17957","JVASP-1151"] + f=open('id_prop.csv','w') + dat=data('dft_3d') #[0:num_samples] + test=[] + train=[] + for i in tqdm(dat,total=len(dat)): + if i['jid'] in jids: + atoms=Atoms.from_dict(i['atoms']) + prompt = make_diffractgpt_prompt(atoms, jid=i['jid'],num_peaks=20) + filename='POSCAR-'+i['jid'] + atoms.write_poscar(filename) + line=filename+','+prompt['peak_text']+'\n' + f.write(line) + + + + #print(i['jid'],prompt) + train.append(prompt) + if i['jid']=="JVASP-32": + atoms=Atoms.from_dict(i['atoms']) + prompt = make_diffractgpt_prompt(atoms, jid=i['jid'],num_peaks=20) + #print(i['jid'],prompt) + test.append(prompt) + train.append(prompt) + if i['jid']=="JVASP-15014": + atoms=Atoms.from_dict(i['atoms']) + prompt = make_diffractgpt_prompt(atoms, jid=i['jid'],num_peaks=20) + #print(i['jid'],prompt) + test.append(prompt) + train.append(prompt) + if i['jid']=="JVASP-1002": + atoms=Atoms.from_dict(i['atoms']) + prompt = make_diffractgpt_prompt(atoms, jid=i['jid'],num_peaks=20) + #print(i['jid'],prompt) + test.append(prompt) + train.append(prompt) + if i['jid']=="JVASP-107": + atoms=Atoms.from_dict(i['atoms']) + prompt = make_diffractgpt_prompt(atoms, jid=i['jid'],num_peaks=20) + #print(i['jid'],prompt) + test.append(prompt) + train.append(prompt) + if i['jid']=="JVASP-17957": + atoms=Atoms.from_dict(i['atoms']) + prompt = make_diffractgpt_prompt(atoms, jid=i['jid'],num_peaks=20) + #print(i['jid'],prompt) + test.append(prompt) + train.append(prompt) + f.close() + dumpjson(data=train,filename='alpaca_prop_train.json') + dumpjson(data=test,filename='alpaca_prop_test.json') From 5e6633b7bd8e865eff0a65cedc17ed9b1ea35387 Mon Sep 17 00:00:00 2001 From: crhysc Date: Wed, 29 Oct 2025 13:30:45 -0400 Subject: [PATCH 03/50] initialize code to make train test alpaca jsons --- atomgpt/scripts/ramangpt/alpaca_train_test.py | 55 +++++++++++++++++++ 1 file changed, 55 insertions(+) create mode 100644 atomgpt/scripts/ramangpt/alpaca_train_test.py diff --git a/atomgpt/scripts/ramangpt/alpaca_train_test.py b/atomgpt/scripts/ramangpt/alpaca_train_test.py new file mode 100644 index 0000000..dc10970 --- /dev/null +++ b/atomgpt/scripts/ramangpt/alpaca_train_test.py @@ -0,0 +1,55 @@ +import argparse +import json +from pathlib import Path +import numpy as np +from scipy.signal import find_peaks +from jarvis.core.atoms import Atoms +from tqdm import tqdm +import csv +import pandas as pd + +def get_crystal_string_t(atoms): + lengths = atoms.lattice.abc + angles = atoms.lattice.angles + atom_ids = atoms.elements + frac_coords = atoms.frac_coords + + crystal_str = ( + " ".join(["{0:.2f}".format(x) for x in lengths]) + + "\n" + + " ".join([str(int(x)) for x in angles]) + + "\n" + + "\n".join( + [ + str(t) + " " + " ".join(["{0:.3f}".format(x) for x in c]) + for t, c in zip(atom_ids, frac_coords) + ] + ) + ) + return crystal_str + +def main() -> None: + parser = argparse.ArgumentParser() + parser.add_argument("raman_json_path") + args = parser.parse_args() + df = pd.DataFrame() + with open(args.raman_json_path, mode="r", encoding="utf-8") as file: + data = json.load(file) + atoms = [] + for obj in data: + atoms = Atoms( + lattice_mat=data[obj]['atoms']['lattice_mat'], + coords=data[obj]['atoms']['coords'], + elements=data[obj]['atoms']['elements'] + ) + jid = 
data[obj]['id'] + intensities = data[obj]['raman_activity'] + frequencies = data[obj]['freq_cm'] + + + + + + +if __name__ == '__main__': + main() From a6d7842ba632cce5859e36abd69ac7db6b156b9e Mon Sep 17 00:00:00 2001 From: crhysc Date: Wed, 29 Oct 2025 15:48:55 -0400 Subject: [PATCH 04/50] version 1. should make sentences --- atomgpt/scripts/ramangpt/alpaca_train_test.py | 170 ++++++++++++++---- 1 file changed, 139 insertions(+), 31 deletions(-) diff --git a/atomgpt/scripts/ramangpt/alpaca_train_test.py b/atomgpt/scripts/ramangpt/alpaca_train_test.py index dc10970..516d8e3 100644 --- a/atomgpt/scripts/ramangpt/alpaca_train_test.py +++ b/atomgpt/scripts/ramangpt/alpaca_train_test.py @@ -1,55 +1,163 @@ +#!/usr/bin/env python3 +# make_raman_alpaca.py +# +# Read Raman JSON, keep only modes with activity > 0, format activities to N decimals, +# and write Alpaca-style train/test JSONs compatible with your finetuning script. + import argparse import json +import random from pathlib import Path + import numpy as np -from scipy.signal import find_peaks -from jarvis.core.atoms import Atoms from tqdm import tqdm -import csv -import pandas as pd +from jarvis.core.atoms import Atoms +from jarvis.db.jsonutils import dumpjson -def get_crystal_string_t(atoms): + +def get_crystal_string_t(atoms: Atoms) -> str: lengths = atoms.lattice.abc angles = atoms.lattice.angles atom_ids = atoms.elements frac_coords = atoms.frac_coords - crystal_str = ( - " ".join(["{0:.2f}".format(x) for x in lengths]) + " ".join("{0:.2f}".format(x) for x in lengths) + "\n" - + " ".join([str(int(x)) for x in angles]) + + " ".join(str(int(x)) for x in angles) + "\n" + "\n".join( - [ - str(t) + " " + " ".join(["{0:.3f}".format(x) for x in c]) - for t, c in zip(atom_ids, frac_coords) - ] + f"{t} " + " ".join("{0:.3f}".format(x) for x in c) + for t, c in zip(atom_ids, frac_coords) ) ) return crystal_str -def main() -> None: - parser = argparse.ArgumentParser() - parser.add_argument("raman_json_path") - args = parser.parse_args() - df = pd.DataFrame() - with open(args.raman_json_path, mode="r", encoding="utf-8") as file: - data = json.load(file) - atoms = [] - for obj in data: - atoms = Atoms( - lattice_mat=data[obj]['atoms']['lattice_mat'], - coords=data[obj]['atoms']['coords'], - elements=data[obj]['atoms']['elements'] - ) - jid = data[obj]['id'] - intensities = data[obj]['raman_activity'] - frequencies = data[obj]['freq_cm'] - +def format_fixed_decimals(val: float, decimals: int = 6) -> str: + """Format a number with fixed decimal places (handles scientific-notation inputs).""" + try: + v = float(val) + except Exception: + v = np.nan + if not np.isfinite(v): + return "0" # safe fallback + return f"{v:.{decimals}f}" + + +def make_raman_record( + entry: dict, + freq_decimals: int = 2, + activity_decimals: int = 6, +) -> dict | None: + atoms_dict = entry.get("atoms") + if not atoms_dict: + return None + + try: + atoms = Atoms.from_dict(atoms_dict) + except Exception: + return None + + try: + formula = atoms.composition.reduced_formula + except Exception: + formula = "Unknown" + + # Coerce to float; handles numbers or strings like "7.88E-7" + freqs = np.array(entry.get("freq_cm", []), dtype=float) + acts = np.array(entry.get("raman_activity", []), dtype=float) + + if freqs.size == 0 or acts.size == 0 or freqs.size != acts.size: + return None + + mask = acts > 0.0 + if not np.any(mask): + return None + + freqs_nz = freqs[mask] + acts_nz = acts[mask] + order = np.argsort(freqs_nz) + freqs_nz = freqs_nz[order] + acts_nz = acts_nz[order] + 
fmt_f = f"{{0:.{freq_decimals}f}}" + pairs = [ + f"{fmt_f.format(float(freq))} cm^-1({format_fixed_decimals(float(act), activity_decimals)})" + for freq, act in zip(freqs_nz, acts_nz) + ] + raman_text = ", ".join(pairs) + rec = { + "instruction": "Below is a description of a material.", + "input": ( + f"The chemical formula is: {formula}.\n" + f"The Raman spectrum shows active modes at: {raman_text}.\n" + f"Generate atomic structure description with lattice lengths, angles, coordinates and atom types." + ), + "output": get_crystal_string_t(atoms), + "id": entry.get("id", "na"), # kept in BOTH train & test for your evaluator + "raman_text": raman_text, # extra field; trainer ignores it + } + return rec -if __name__ == '__main__': + +def main(): + p = argparse.ArgumentParser( + description="Build Alpaca train/test JSONs from a Raman spectroscopy dataset." + ) + p.add_argument("--raman-json", type=Path, required=True, + help="Path to Raman JSON file (list of entries).") + p.add_argument("--test-ratio", type=float, default=0.1, + help="Fraction for test split (default: 0.10).") + p.add_argument("--seed", type=int, default=42, + help="Random seed for the split (default: 42).") + p.add_argument("--train-out", type=Path, default=Path("alpaca_prop_train.json"), + help="Output path for train JSON.") + p.add_argument("--test-out", type=Path, default=Path("alpaca_prop_test.json"), + help="Output path for test JSON.") + p.add_argument("--freq-decimals", type=int, default=2, + help="Decimals for frequencies in cm^-1 (default: 2).") + p.add_argument("--activity-decimals", type=int, default=6, + help="Decimals for Raman activities (default: 6).") + args = p.parse_args() + + with args.raman_json.open("r", encoding="utf-8") as f: + raw = json.load(f) + + records = [] + for entry in tqdm(raw, total=len(raw), desc="Processing Raman entries"): + rec = make_raman_record( + entry, + freq_decimals=args.freq_decimals, + activity_decimals=args.activity_decimals, + ) + if rec is not None: + records.append(rec) + + if not records: + raise SystemExit("No valid records with nonzero Raman activity were found.") + + rng = random.Random(args.seed) + rng.shuffle(records) + n_total = len(records) + n_test = max(1, int(round(args.test_ratio * n_total))) + test = records[:n_test] + train = records[n_test:] + + dumpjson(data=train, filename=str(args.train_out)) + dumpjson(data=test, filename=str(args.test_out)) + + print(f"Wrote {len(train)} train records → {args.train_out}") + print(f"Wrote {len(test)} test records → {args.test_out}") + + # Quick compatibility check for the evaluator (needs id/input/output) + ex = test[0] + for k in ("id", "instruction", "input", "output"): + if k not in ex: + print(f"WARNING: key '{k}' missing from test example!") + + +if __name__ == "__main__": main() + From fbb0c3acabf1b41161e589e4e061f4d76177cc28 Mon Sep 17 00:00:00 2001 From: crhysc Date: Wed, 29 Oct 2025 15:54:26 -0400 Subject: [PATCH 05/50] initialize runner --- atomgpt/scripts/ramangpt/runner.sh | 10 ++++++++++ 1 file changed, 10 insertions(+) create mode 100644 atomgpt/scripts/ramangpt/runner.sh diff --git a/atomgpt/scripts/ramangpt/runner.sh b/atomgpt/scripts/ramangpt/runner.sh new file mode 100644 index 0000000..bc4e309 --- /dev/null +++ b/atomgpt/scripts/ramangpt/runner.sh @@ -0,0 +1,10 @@ +#!/bin/bash + +python make_raman_alpaca.py \ + --raman-json ramandb.json \ + --test-ratio 0.1 \ + --train-out alpaca_prop_train.json \ + --test-out alpaca_prop_test.json \ + --freq-decimals 9 \ + --activity-decimals 6 + From 
fae7eee45c4c7e19f578acd92ac2dba0db8f2b2c Mon Sep 17 00:00:00 2001 From: Charles Campbell Date: Wed, 29 Oct 2025 16:04:25 -0400 Subject: [PATCH 06/50] name change --- .../ramangpt/{alpaca_train_test.py => make_raman_alpaca.py} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename atomgpt/scripts/ramangpt/{alpaca_train_test.py => make_raman_alpaca.py} (100%) diff --git a/atomgpt/scripts/ramangpt/alpaca_train_test.py b/atomgpt/scripts/ramangpt/make_raman_alpaca.py similarity index 100% rename from atomgpt/scripts/ramangpt/alpaca_train_test.py rename to atomgpt/scripts/ramangpt/make_raman_alpaca.py From b7a9cde7c4d700e98b2f60015cb753600e2417e2 Mon Sep 17 00:00:00 2001 From: Charles Campbell Date: Wed, 29 Oct 2025 16:12:55 -0400 Subject: [PATCH 07/50] update prompt to include () --- atomgpt/scripts/ramangpt/make_raman_alpaca.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/atomgpt/scripts/ramangpt/make_raman_alpaca.py b/atomgpt/scripts/ramangpt/make_raman_alpaca.py index 516d8e3..5529fb1 100644 --- a/atomgpt/scripts/ramangpt/make_raman_alpaca.py +++ b/atomgpt/scripts/ramangpt/make_raman_alpaca.py @@ -92,7 +92,7 @@ def make_raman_record( "instruction": "Below is a description of a material.", "input": ( f"The chemical formula is: {formula}.\n" - f"The Raman spectrum shows active modes at: {raman_text}.\n" + f"The Raman spectrum shows active modes with normalized intensities () at: {raman_text}.\n" f"Generate atomic structure description with lattice lengths, angles, coordinates and atom types." ), "output": get_crystal_string_t(atoms), From ff64e8361b6f745b0371770ac9aa779e1eb8e327 Mon Sep 17 00:00:00 2001 From: Charles Campbell Date: Wed, 29 Oct 2025 16:17:16 -0400 Subject: [PATCH 08/50] add cm^-1 to prompt --- atomgpt/scripts/ramangpt/make_raman_alpaca.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/atomgpt/scripts/ramangpt/make_raman_alpaca.py b/atomgpt/scripts/ramangpt/make_raman_alpaca.py index 5529fb1..fcedb48 100644 --- a/atomgpt/scripts/ramangpt/make_raman_alpaca.py +++ b/atomgpt/scripts/ramangpt/make_raman_alpaca.py @@ -83,7 +83,7 @@ def make_raman_record( fmt_f = f"{{0:.{freq_decimals}f}}" pairs = [ - f"{fmt_f.format(float(freq))} cm^-1({format_fixed_decimals(float(act), activity_decimals)})" + f"{fmt_f.format(float(freq))} ({format_fixed_decimals(float(act), activity_decimals)})" for freq, act in zip(freqs_nz, acts_nz) ] raman_text = ", ".join(pairs) @@ -92,7 +92,7 @@ def make_raman_record( "instruction": "Below is a description of a material.", "input": ( f"The chemical formula is: {formula}.\n" - f"The Raman spectrum shows active modes with normalized intensities () at: {raman_text}.\n" + f"The Raman spectrum shows active modes in cm^-1 with normalized intensities () at: {raman_text}.\n" f"Generate atomic structure description with lattice lengths, angles, coordinates and atom types." 
), "output": get_crystal_string_t(atoms), From 3b987f8e9d25e147d9d3b34560a1cf467d31fd83 Mon Sep 17 00:00:00 2001 From: Charles Campbell Date: Fri, 31 Oct 2025 14:44:37 -0400 Subject: [PATCH 09/50] remove rounding bug --- atomgpt/scripts/ramangpt/make_raman_alpaca.py | 118 ++---------------- 1 file changed, 8 insertions(+), 110 deletions(-) diff --git a/atomgpt/scripts/ramangpt/make_raman_alpaca.py b/atomgpt/scripts/ramangpt/make_raman_alpaca.py index fcedb48..dd2e8b4 100644 --- a/atomgpt/scripts/ramangpt/make_raman_alpaca.py +++ b/atomgpt/scripts/ramangpt/make_raman_alpaca.py @@ -1,49 +1,3 @@ -#!/usr/bin/env python3 -# make_raman_alpaca.py -# -# Read Raman JSON, keep only modes with activity > 0, format activities to N decimals, -# and write Alpaca-style train/test JSONs compatible with your finetuning script. - -import argparse -import json -import random -from pathlib import Path - -import numpy as np -from tqdm import tqdm -from jarvis.core.atoms import Atoms -from jarvis.db.jsonutils import dumpjson - - -def get_crystal_string_t(atoms: Atoms) -> str: - lengths = atoms.lattice.abc - angles = atoms.lattice.angles - atom_ids = atoms.elements - frac_coords = atoms.frac_coords - crystal_str = ( - " ".join("{0:.2f}".format(x) for x in lengths) - + "\n" - + " ".join(str(int(x)) for x in angles) - + "\n" - + "\n".join( - f"{t} " + " ".join("{0:.3f}".format(x) for x in c) - for t, c in zip(atom_ids, frac_coords) - ) - ) - return crystal_str - - -def format_fixed_decimals(val: float, decimals: int = 6) -> str: - """Format a number with fixed decimal places (handles scientific-notation inputs).""" - try: - v = float(val) - except Exception: - v = np.nan - if not np.isfinite(v): - return "0" # safe fallback - return f"{v:.{decimals}f}" - - def make_raman_record( entry: dict, freq_decimals: int = 2, @@ -70,7 +24,11 @@ def make_raman_record( if freqs.size == 0 or acts.size == 0 or freqs.size != acts.size: return None - mask = acts > 0.0 + # NEW: drop NaN/inf, then drop anything that *rounds* to 0.00... at the chosen precision + acts = np.where(np.isfinite(acts), acts, 0.0) + rounded = np.round(acts, decimals=activity_decimals) + mask = rounded != 0.0 # this also excludes "-0.00" + if not np.any(mask): return None @@ -83,7 +41,7 @@ def make_raman_record( fmt_f = f"{{0:.{freq_decimals}f}}" pairs = [ - f"{fmt_f.format(float(freq))} ({format_fixed_decimals(float(act), activity_decimals)})" + f"{fmt_f.format(float(freq))} ({format_fixed_decimals(float(act), activity_decicals)})" for freq, act in zip(freqs_nz, acts_nz) ] raman_text = ", ".join(pairs) @@ -96,68 +54,8 @@ def make_raman_record( f"Generate atomic structure description with lattice lengths, angles, coordinates and atom types." ), "output": get_crystal_string_t(atoms), - "id": entry.get("id", "na"), # kept in BOTH train & test for your evaluator - "raman_text": raman_text, # extra field; trainer ignores it + "id": entry.get("id", "na"), + "raman_text": raman_text, } return rec - -def main(): - p = argparse.ArgumentParser( - description="Build Alpaca train/test JSONs from a Raman spectroscopy dataset." 
- ) - p.add_argument("--raman-json", type=Path, required=True, - help="Path to Raman JSON file (list of entries).") - p.add_argument("--test-ratio", type=float, default=0.1, - help="Fraction for test split (default: 0.10).") - p.add_argument("--seed", type=int, default=42, - help="Random seed for the split (default: 42).") - p.add_argument("--train-out", type=Path, default=Path("alpaca_prop_train.json"), - help="Output path for train JSON.") - p.add_argument("--test-out", type=Path, default=Path("alpaca_prop_test.json"), - help="Output path for test JSON.") - p.add_argument("--freq-decimals", type=int, default=2, - help="Decimals for frequencies in cm^-1 (default: 2).") - p.add_argument("--activity-decimals", type=int, default=6, - help="Decimals for Raman activities (default: 6).") - args = p.parse_args() - - with args.raman_json.open("r", encoding="utf-8") as f: - raw = json.load(f) - - records = [] - for entry in tqdm(raw, total=len(raw), desc="Processing Raman entries"): - rec = make_raman_record( - entry, - freq_decimals=args.freq_decimals, - activity_decimals=args.activity_decimals, - ) - if rec is not None: - records.append(rec) - - if not records: - raise SystemExit("No valid records with nonzero Raman activity were found.") - - rng = random.Random(args.seed) - rng.shuffle(records) - n_total = len(records) - n_test = max(1, int(round(args.test_ratio * n_total))) - test = records[:n_test] - train = records[n_test:] - - dumpjson(data=train, filename=str(args.train_out)) - dumpjson(data=test, filename=str(args.test_out)) - - print(f"Wrote {len(train)} train records → {args.train_out}") - print(f"Wrote {len(test)} test records → {args.test_out}") - - # Quick compatibility check for the evaluator (needs id/input/output) - ex = test[0] - for k in ("id", "instruction", "input", "output"): - if k not in ex: - print(f"WARNING: key '{k}' missing from test example!") - - -if __name__ == "__main__": - main() - From 2425df924f24751635f0f7ebfa5fc1aa4459599e Mon Sep 17 00:00:00 2001 From: Charles Campbell Date: Fri, 31 Oct 2025 14:49:48 -0400 Subject: [PATCH 10/50] rest of the script --- atomgpt/scripts/ramangpt/make_raman_alpaca.py | 133 ++++++++++++++++-- 1 file changed, 122 insertions(+), 11 deletions(-) diff --git a/atomgpt/scripts/ramangpt/make_raman_alpaca.py b/atomgpt/scripts/ramangpt/make_raman_alpaca.py index dd2e8b4..ffa1a4c 100644 --- a/atomgpt/scripts/ramangpt/make_raman_alpaca.py +++ b/atomgpt/scripts/ramangpt/make_raman_alpaca.py @@ -1,3 +1,50 @@ +#!/usr/bin/env python3 +# make_raman_alpaca.py +# +# Read Raman JSON, keep only modes whose activity is non-zero AFTER rounding +# to the requested precision (so "0.000000" modes are dropped), format values, +# and write Alpaca-style train/test JSONs compatible with your finetuning script. 
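+#
+# Typical invocation (flags mirror runner.sh in this directory):
+#   python make_raman_alpaca.py --raman-json ramandb.json --test-ratio 0.1 \
+#     --train-out alpaca_prop_train.json --test-out alpaca_prop_test.json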
+ +import argparse +import json +import random +from pathlib import Path + +import numpy as np +from tqdm import tqdm +from jarvis.core.atoms import Atoms +from jarvis.db.jsonutils import dumpjson + + +def get_crystal_string_t(atoms: Atoms) -> str: + lengths = atoms.lattice.abc + angles = atoms.lattice.angles + atom_ids = atoms.elements + frac_coords = atoms.frac_coords + crystal_str = ( + " ".join("{0:.2f}".format(x) for x in lengths) + + "\n" + + " ".join(str(int(x)) for x in angles) + + "\n" + + "\n".join( + f"{t} " + " ".join("{0:.3f}".format(x) for x in c) + for t, c in zip(atom_ids, frac_coords) + ) + ) + return crystal_str + + +def format_fixed_decimals(val: float, decimals: int = 6) -> str: + """Format a number with fixed decimal places (handles scientific-notation inputs).""" + try: + v = float(val) + except Exception: + return "0" + if not np.isfinite(v): + return "0" + return f"{v:.{decimals}f}" + + def make_raman_record( entry: dict, freq_decimals: int = 2, @@ -24,25 +71,29 @@ def make_raman_record( if freqs.size == 0 or acts.size == 0 or freqs.size != acts.size: return None - # NEW: drop NaN/inf, then drop anything that *rounds* to 0.00... at the chosen precision + # Drop non-finite, then exclude anything that *appears* as 0.00... after rounding acts = np.where(np.isfinite(acts), acts, 0.0) - rounded = np.round(acts, decimals=activity_decimals) - mask = rounded != 0.0 # this also excludes "-0.00" + acts_rounded = np.round(acts, decimals=activity_decimals) + keep_mask = acts_rounded != 0.0 # also drops "-0.0" - if not np.any(mask): + if not np.any(keep_mask): return None - freqs_nz = freqs[mask] - acts_nz = acts[mask] + freqs_kept = freqs[keep_mask] + acts_kept = acts[keep_mask] + acts_rounded_kept = acts_rounded[keep_mask] - order = np.argsort(freqs_nz) - freqs_nz = freqs_nz[order] - acts_nz = acts_nz[order] + # Sort by frequency + order = np.argsort(freqs_kept) + freqs_kept = freqs_kept[order] + acts_kept = acts_kept[order] + acts_rounded_kept = acts_rounded_kept[order] + # Format output strings fmt_f = f"{{0:.{freq_decimals}f}}" pairs = [ - f"{fmt_f.format(float(freq))} ({format_fixed_decimals(float(act), activity_decicals)})" - for freq, act in zip(freqs_nz, acts_nz) + f"{fmt_f.format(float(freq))} ({format_fixed_decimals(float(act_r), activity_decimals)})" + for freq, act_r in zip(freqs_kept, acts_rounded_kept) ] raman_text = ", ".join(pairs) @@ -59,3 +110,63 @@ def make_raman_record( } return rec + +def main(): + p = argparse.ArgumentParser( + description="Build Alpaca train/test JSONs from a Raman spectroscopy dataset." 
+ ) + p.add_argument("--raman-json", type=Path, required=True, + help="Path to Raman JSON file (list of entries).") + p.add_argument("--test-ratio", type=float, default=0.1, + help="Fraction for test split (default: 0.10).") + p.add_argument("--seed", type=int, default=42, + help="Random seed for the split (default: 42).") + p.add_argument("--train-out", type=Path, default=Path("alpaca_prop_train.json"), + help="Output path for train JSON.") + p.add_argument("--test-out", type=Path, default=Path("alpaca_prop_test.json"), + help="Output path for test JSON.") + p.add_argument("--freq-decimals", type=int, default=2, + help="Decimals for frequencies in cm^-1 (default: 2).") + p.add_argument("--activity-decimals", type=int, default=6, + help="Decimals for Raman activities (default: 6).") + args = p.parse_args() + + with args.raman_json.open("r", encoding="utf-8") as f: + raw = json.load(f) + + records = [] + for entry in tqdm(raw, total=len(raw), desc="Processing Raman entries"): + rec = make_raman_record( + entry, + freq_decimals=args.freq_decimals, + activity_decimals=args.activity_decimals, + ) + if rec is not None: + records.append(rec) + + if not records: + raise SystemExit("No valid records with nonzero Raman activity (after rounding) were found.") + + rng = random.Random(args.seed) + rng.shuffle(records) + n_total = len(records) + n_test = max(1, int(round(args.test_ratio * n_total))) + test = records[:n_test] + train = records[n_test:] + + dumpjson(data=train, filename=str(args.train_out)) + dumpjson(data=test, filename=str(args.test_out)) + + print(f"Wrote {len(train)} train records → {args.train_out}") + print(f"Wrote {len(test)} test records → {args.test_out}") + + # Quick compatibility check for the evaluator (needs id/input/output) + ex = test[0] + for k in ("id", "instruction", "input", "output"): + if k not in ex: + print(f"WARNING: key '{k}' missing from test example!") + + +if __name__ == "__main__": + main() + From 9f0b3c2a4607303ae8cc408cf0cce44c2093bb32 Mon Sep 17 00:00:00 2001 From: Charles Campbell Date: Fri, 31 Oct 2025 15:35:15 -0400 Subject: [PATCH 11/50] freq normal and niggli reduce --- atomgpt/scripts/ramangpt/make_raman_alpaca.py | 86 +++++++++++++++---- 1 file changed, 70 insertions(+), 16 deletions(-) diff --git a/atomgpt/scripts/ramangpt/make_raman_alpaca.py b/atomgpt/scripts/ramangpt/make_raman_alpaca.py index ffa1a4c..12338fa 100644 --- a/atomgpt/scripts/ramangpt/make_raman_alpaca.py +++ b/atomgpt/scripts/ramangpt/make_raman_alpaca.py @@ -1,9 +1,10 @@ #!/usr/bin/env python3 # make_raman_alpaca.py # -# Read Raman JSON, keep only modes whose activity is non-zero AFTER rounding -# to the requested precision (so "0.000000" modes are dropped), format values, -# and write Alpaca-style train/test JSONs compatible with your finetuning script. +# Read Raman JSON, optionally Niggli-reduce cells, keep only modes whose activity +# is non-zero AFTER rounding to the requested precision, optionally normalize +# frequencies to [0,1] (1.0 = max kept freq), format values, and write +# Alpaca-style train/test JSONs. import argparse import json @@ -34,6 +35,31 @@ def get_crystal_string_t(atoms: Atoms) -> str: return crystal_str +def niggli_reduce_atoms(atoms: Atoms) -> Atoms: + """ + Try to Niggli-reduce using pymatgen (preferred). + Falls back to returning the original atoms if reduction fails or pymatgen is absent. 
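+    Niggli reduction rewrites the cell in a canonical reduced basis
+    (shortest lattice vectors), so equivalent cells tend to serialize
+    to the same crystal string.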
+ """ + try: + from pymatgen.core import Structure, Lattice # lazy import + species = list(atoms.elements) # per-site symbols + frac = np.array(atoms.frac_coords, dtype=float) + lat = np.array(atoms.lattice.matrix, dtype=float) + pmg = Structure(Lattice(lat), species, frac, coords_are_cartesian=False) + + # Niggli reduction on the full structure (updates lattice + fractional coords) + reduced, _ = pmg.get_reduced_structure(reduction_algo="niggli") + return Atoms( + lattice_mat=np.array(reduced.lattice.matrix), + coords=np.array(reduced.frac_coords), + elements=[str(s) for s in reduced.species], + cartesian=False, + ) + except Exception: + # Best-effort fallback: return original if anything goes wrong + return atoms + + def format_fixed_decimals(val: float, decimals: int = 6) -> str: """Format a number with fixed decimal places (handles scientific-notation inputs).""" try: @@ -49,6 +75,8 @@ def make_raman_record( entry: dict, freq_decimals: int = 2, activity_decimals: int = 6, + normalize_freq: bool = False, + niggli: bool = False, ) -> dict | None: atoms_dict = entry.get("atoms") if not atoms_dict: @@ -59,6 +87,10 @@ def make_raman_record( except Exception: return None + # Optional Niggli reduction BEFORE anything else + if niggli: + atoms = niggli_reduce_atoms(atoms) + try: formula = atoms.composition.reduced_formula except Exception: @@ -80,30 +112,45 @@ def make_raman_record( return None freqs_kept = freqs[keep_mask] - acts_kept = acts[keep_mask] acts_rounded_kept = acts_rounded[keep_mask] - # Sort by frequency - order = np.argsort(freqs_kept) - freqs_kept = freqs_kept[order] - acts_kept = acts_kept[order] + # Optional normalize frequencies to [0,1], with 1.0 = max kept frequency + if normalize_freq: + max_f = float(np.max(freqs_kept)) if freqs_kept.size else 0.0 + if max_f > 0.0: + freqs_display = freqs_kept / max_f # zero maps to 0.0, max -> 1.0 + else: + freqs_display = np.zeros_like(freqs_kept) + freq_unit_caption = "normalized frequency 0–1" + else: + freqs_display = freqs_kept + freq_unit_caption = "cm^-1" + + # Sort by *display* frequency so ordering matches what we print + order = np.argsort(freqs_display) + freqs_display = freqs_display[order] + freqs_kept = freqs_kept[order] # keep original too, in case needed later acts_rounded_kept = acts_rounded_kept[order] # Format output strings fmt_f = f"{{0:.{freq_decimals}f}}" pairs = [ - f"{fmt_f.format(float(freq))} ({format_fixed_decimals(float(act_r), activity_decimals)})" - for freq, act_r in zip(freqs_kept, acts_rounded_kept) + f"{fmt_f.format(float(fd))} ({format_fixed_decimals(float(act_r), activity_decimals)})" + for fd, act_r in zip(freqs_display, acts_rounded_kept) ] raman_text = ", ".join(pairs) + # Build prompt text + input_header = ( + f"The chemical formula is: {formula}.\n" + f"The Raman spectrum shows active modes in {freq_unit_caption} " + f"with normalized intensities () at: {raman_text}.\n" + f"Generate atomic structure description with lattice lengths, angles, coordinates and atom types." + ) + rec = { "instruction": "Below is a description of a material.", - "input": ( - f"The chemical formula is: {formula}.\n" - f"The Raman spectrum shows active modes in cm^-1 with normalized intensities () at: {raman_text}.\n" - f"Generate atomic structure description with lattice lengths, angles, coordinates and atom types." 
- ), + "input": input_header, "output": get_crystal_string_t(atoms), "id": entry.get("id", "na"), "raman_text": raman_text, @@ -126,9 +173,13 @@ def main(): p.add_argument("--test-out", type=Path, default=Path("alpaca_prop_test.json"), help="Output path for test JSON.") p.add_argument("--freq-decimals", type=int, default=2, - help="Decimals for frequencies in cm^-1 (default: 2).") + help="Decimals for frequencies (cm^-1 or normalized), default: 2.") p.add_argument("--activity-decimals", type=int, default=6, help="Decimals for Raman activities (default: 6).") + p.add_argument("--normalize-freq", action="store_true", + help="Normalize frequencies to [0,1]; 1.0 = max kept frequency after intensity rounding.") + p.add_argument("--niggli-reduce", action="store_true", + help="Apply Niggli reduction to each cell before partitioning into train/test.") args = p.parse_args() with args.raman_json.open("r", encoding="utf-8") as f: @@ -140,6 +191,8 @@ def main(): entry, freq_decimals=args.freq_decimals, activity_decimals=args.activity_decimals, + normalize_freq=args.normalize_freq, + niggli=args.niggli_reduce, ) if rec is not None: records.append(rec) @@ -147,6 +200,7 @@ def main(): if not records: raise SystemExit("No valid records with nonzero Raman activity (after rounding) were found.") + # Shuffle & split AFTER optional Niggli reduction (as requested) rng = random.Random(args.seed) rng.shuffle(records) n_total = len(records) From 2771adf70cc4ae5f2a49d66fc56e8ff0ec71a3c8 Mon Sep 17 00:00:00 2001 From: Charles Campbell Date: Tue, 4 Nov 2025 00:21:22 -0500 Subject: [PATCH 12/50] add freq upper bound --- atomgpt/scripts/ramangpt/make_raman_alpaca.py | 61 ++++++++++++++----- 1 file changed, 45 insertions(+), 16 deletions(-) diff --git a/atomgpt/scripts/ramangpt/make_raman_alpaca.py b/atomgpt/scripts/ramangpt/make_raman_alpaca.py index 12338fa..8cfd65b 100644 --- a/atomgpt/scripts/ramangpt/make_raman_alpaca.py +++ b/atomgpt/scripts/ramangpt/make_raman_alpaca.py @@ -18,21 +18,37 @@ def get_crystal_string_t(atoms: Atoms) -> str: - lengths = atoms.lattice.abc - angles = atoms.lattice.angles - atom_ids = atoms.elements - frac_coords = atoms.frac_coords - crystal_str = ( - " ".join("{0:.2f}".format(x) for x in lengths) - + "\n" - + " ".join(str(int(x)) for x in angles) - + "\n" - + "\n".join( - f"{t} " + " ".join("{0:.3f}".format(x) for x in c) - for t, c in zip(atom_ids, frac_coords) - ) + # Lattice + lengths = np.array(atoms.lattice.abc, dtype=float).ravel() + angles = np.array(atoms.lattice.angles, dtype=float).ravel() + + # Per-site species and fractional coordinates; force shape (N, 3) + atom_ids = [str(x) for x in list(atoms.elements)] + frac = np.asarray(atoms.frac_coords, dtype=float) + if frac.ndim == 1: + if frac.size == 3: + frac = frac.reshape(1, 3) + else: + raise ValueError(f"Unexpected fractional coord shape: {frac.shape}") + elif frac.ndim == 2 and frac.shape[1] != 3: + raise ValueError(f"Expected frac coords with 3 columns, got {frac.shape}") + + # If species length doesn't match coords, broadcast a single species tag + if len(atom_ids) != len(frac): + if len(atom_ids) == 1 and len(frac) > 1: + atom_ids = atom_ids * len(frac) + else: + raise ValueError( + f"Elements length ({len(atom_ids)}) != coords length ({len(frac)})" + ) + + lengths_str = " ".join(f"{x:.2f}" for x in lengths.tolist()) + angles_str = " ".join(f"{x:.2f}" for x in angles.tolist()) + coords_str = "\n".join( + f"{t} " + " ".join(f"{c:.3f}" for c in row.tolist()) + for t, row in zip(atom_ids, frac) ) - return crystal_str + 
return f"{lengths_str}\n{angles_str}\n{coords_str}" def niggli_reduce_atoms(atoms: Atoms) -> Atoms: @@ -77,6 +93,7 @@ def make_raman_record( activity_decimals: int = 6, normalize_freq: bool = False, niggli: bool = False, + include_max_freq: bool = False, ) -> dict | None: atoms_dict = entry.get("atoms") if not atoms_dict: @@ -115,8 +132,8 @@ def make_raman_record( acts_rounded_kept = acts_rounded[keep_mask] # Optional normalize frequencies to [0,1], with 1.0 = max kept frequency + max_f = float(np.max(freqs_kept)) if freqs_kept.size else 0.0 if normalize_freq: - max_f = float(np.max(freqs_kept)) if freqs_kept.size else 0.0 if max_f > 0.0: freqs_display = freqs_kept / max_f # zero maps to 0.0, max -> 1.0 else: @@ -141,10 +158,17 @@ def make_raman_record( raman_text = ", ".join(pairs) # Build prompt text + extra_norm_line = "" + if normalize_freq and include_max_freq and max_f > 0.0: + extra_norm_line = ( + f"\nNormalization reference: 1.00 corresponds to " + f"{fmt_f.format(max_f)} cm^-1." + ) + input_header = ( f"The chemical formula is: {formula}.\n" f"The Raman spectrum shows active modes in {freq_unit_caption} " - f"with normalized intensities () at: {raman_text}.\n" + f"with normalized intensities () at: {raman_text}.{extra_norm_line}\n" f"Generate atomic structure description with lattice lengths, angles, coordinates and atom types." ) @@ -155,6 +179,8 @@ def make_raman_record( "id": entry.get("id", "na"), "raman_text": raman_text, } + if normalize_freq and include_max_freq: + rec["max_freq_cm"] = float(max_f) return rec @@ -178,6 +204,8 @@ def main(): help="Decimals for Raman activities (default: 6).") p.add_argument("--normalize-freq", action="store_true", help="Normalize frequencies to [0,1]; 1.0 = max kept frequency after intensity rounding.") + p.add_argument("--include-max-freq", action="store_true", + help="When used with --normalize-freq, include the unnormalized max frequency (that maps to 1.0) in the prompt.") p.add_argument("--niggli-reduce", action="store_true", help="Apply Niggli reduction to each cell before partitioning into train/test.") args = p.parse_args() @@ -193,6 +221,7 @@ def main(): activity_decimals=args.activity_decimals, normalize_freq=args.normalize_freq, niggli=args.niggli_reduce, + include_max_freq=args.include_max_freq, ) if rec is not None: records.append(rec) From 69c3463c4f141a11453129c92cc098b89b92e12e Mon Sep 17 00:00:00 2001 From: crhysc Date: Thu, 13 Nov 2025 13:56:05 -0500 Subject: [PATCH 13/50] initial commit --- atomgpt/inverse_models/gpt_oss.py | 154 ++++++++++++++++++++++++++++++ 1 file changed, 154 insertions(+) create mode 100644 atomgpt/inverse_models/gpt_oss.py diff --git a/atomgpt/inverse_models/gpt_oss.py b/atomgpt/inverse_models/gpt_oss.py new file mode 100644 index 0000000..dc6a395 --- /dev/null +++ b/atomgpt/inverse_models/gpt_oss.py @@ -0,0 +1,154 @@ +from atomgpt.inverse_models.llama import * # noqa: F401,F403 +import os + +from atomgpt.inverse_models._utils import __version__ # noqa: F401 +from atomgpt.inverse_models._utils2 import Version, _get_dtype # noqa: F401 + +try: + # New HF GPT-OSS modeling API + from transformers.models.gpt_oss.modeling_gpt_oss import ( + GptOssModel, + GptOssForCausalLM, + ) +except Exception as exc: # pragma: no cover + raise ImportError( + "AtomGPT: transformers installation does not appear to include " + "the `gpt_oss` model. Please upgrade transformers:\n" + ' pip install --upgrade "transformers"\n' + "and ensure you are on a release that supports GPT-OSS." 
+ ) from exc + + +# Convenience list of all 4 Unsloth GPT-OSS models that are supported via +# FastLanguageModel.from_pretrained(..., model_name=...). +# +# You can use these as drop-in `model_name` values: +# +# from atomgpt.inverse_models.gpt_oss import UNSLOTH_GPT_OSS_MODELS +# model_name = UNSLOTH_GPT_OSS_MODELS[0] +# +UNSLOTH_GPT_OSS_MODELS = [ + # BitsAndBytes 4bit Unsloth quantizations + "unsloth/gpt-oss-20b-unsloth-bnb-4bit", + "unsloth/gpt-oss-120b-unsloth-bnb-4bit", + # MXFP4 “original” weights that Unsloth wraps + "unsloth/gpt-oss-20b", + "unsloth/gpt-oss-120b", +] + + +def _log_once(msg: str) -> None: + """Tiny helper to avoid spamming logs if imported multiple times.""" + if getattr(_log_once, "_seen", None) is None: + _log_once._seen = set() + if msg in _log_once._seen: + return + _log_once._seen.add(msg) + print(msg) + + +class FastGptOssModel(FastLlamaModel): + """ + Fast GPT-OSS integration for AtomGPT. + + This mirrors the overall structure of `FastMistralModel` but takes a more + conservative approach: + + * We **do not** override GPT-OSS attention / MoE internals. Those are + handled by the upstream `transformers` implementation and whatever + `unsloth_compile_transformers` is already doing in your loader. + * We **do**: + - patch PEFT `PeftModelForCausalLM.forward` to the same fast path + that LLaMA / Mistral use. + - patch `prepare_inputs_for_generation` for `GptOssForCausalLM`, so + that your generation path stays consistent with LLaMA / Mistral. + * Everything else is delegated to `FastLlamaModel.from_pretrained` with + `model_patcher=FastGptOssModel`, to keep the hierarchy uniform. + """ + + @staticmethod + def pre_patch(): + """ + Apply GPT-OSS-specific patches. + + We deliberately do **not** touch GPT-OSS attention / decoder layer + implementations here, to avoid shape / MoE wiring mistakes. Instead we + reuse only the architecture-agnostic bits from `llama.py`. + """ + # Reuse the PEFT fast forward path (architecture-agnostic: it only + # assumes a CausalLM head with `.lm_head`). + global PeftModelForCausalLM # imported from llama.py via * + PeftModelForCausalLM.forward = PeftModelForCausalLM_fast_forward + + # Reuse the generation input patcher so GPT-OSS works with your + # custom `prepare_inputs_for_generation` handling (past-key-values, + # sliding window, etc.), analogous to Mistral / LLaMA. + fix_prepare_inputs_for_generation(GptOssForCausalLM) + + _log_once( + "AtomGPT: Patched GPT-OSS (GptOssForCausalLM + PEFT) for " + "FastLanguageModel integration." + ) + return + + @staticmethod + def from_pretrained( + model_name: str = "unsloth/gpt-oss-20b-unsloth-bnb-4bit", + max_seq_length: int | None = None, + dtype=None, + load_in_4bit: bool = True, + token=None, + device_map: str | dict = "sequential", + rope_scaling=None, # GPT-OSS does not use classic RoPE scaling, kept for API symmetry + fix_tokenizer: bool = True, + model_patcher=None, + tokenizer_name: str | None = None, + trust_remote_code: bool = False, + **kwargs, + ): + """ + Thin wrapper around `FastLlamaModel.from_pretrained`. + + The important part is that we pass `model_patcher=FastGptOssModel`, + which causes: + + * `FastGptOssModel.pre_patch()` to run before loading. + * All the Unsloth / AtomGPT compile + quantization machinery to be + reused exactly as for LLaMA / Mistral. 
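+
+        The checkpoints listed in UNSLOTH_GPT_OSS_MODELS above are
+        known-good values for `model_name`.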
+ + Usage (drop-in with your loader): + + from atomgpt.inverse_models.loader import FastLanguageModel + + model, tokenizer = FastLanguageModel.from_pretrained( + model_name="unsloth/gpt-oss-20b-unsloth-bnb-4bit", + max_seq_length=2048, + dtype=None, + load_in_4bit=True, + ) + """ + # Defer to the LLaMA machinery – it will: + # * call FastGptOssModel.pre_patch() + # * run unsloth_compile_transformers + # * handle bitsandbytes / 4bit / 8bit / PEFT, etc. + return FastLlamaModel.from_pretrained( + model_name=model_name, + max_seq_length=max_seq_length, + dtype=dtype, + load_in_4bit=load_in_4bit, + token=token, + device_map=device_map, + rope_scaling=rope_scaling, + fix_tokenizer=fix_tokenizer, + model_patcher=FastGptOssModel if model_patcher is None else model_patcher, + tokenizer_name=tokenizer_name, + trust_remote_code=trust_remote_code, + **kwargs, + ) + + +__all__ = [ + "FastGptOssModel", + "UNSLOTH_GPT_OSS_MODELS", +] + From 1d151572bc7c5248759bb1079ed1d562418e1736 Mon Sep 17 00:00:00 2001 From: "C. Rhys Campbell" <149001340+crhysc@users.noreply.github.com> Date: Thu, 13 Nov 2025 14:02:14 -0500 Subject: [PATCH 14/50] Update loader.py --- atomgpt/inverse_models/loader.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/atomgpt/inverse_models/loader.py b/atomgpt/inverse_models/loader.py index 2a66eeb..e473adc 100644 --- a/atomgpt/inverse_models/loader.py +++ b/atomgpt/inverse_models/loader.py @@ -6,6 +6,7 @@ USE_MODELSCOPE, get_transformers_model_type, ) +from atomgpt.inverse_models.gpt_oss import FastGptOssModel from atomgpt.inverse_models.granite import FastGraniteModel from atomgpt.inverse_models.llama import FastLlamaModel, logger from atomgpt.inverse_models.mistral import FastMistralModel @@ -44,6 +45,7 @@ SUPPORTS_GRANITE = transformers_version >= Version("4.46.0") SUPPORTS_QWEN3 = transformers_version >= Version("4.50.3") SUPPORTS_QWEN3_MOE = transformers_version >= Version("4.50.3") +SUPPORTS_GPT_OSS = transformers_version >= Version("4.55.0") if SUPPORTS_GEMMA: from atomgpt.inverse_models.gemma import FastGemmaModel if SUPPORTS_GEMMA2: @@ -294,6 +296,17 @@ def from_pretrained( f"to obtain the latest transformers build, then restart this session." ) dispatch_model = FastGemmaModel + elif model_type == "gpt_oss": + if not SUPPORTS_GPT_OSS: + raise ImportError( + f"AtomGPT: Your transformers version of {transformers_version} " + f"does not support GPT-OSS.\n" + f"The minimum required version is 4.55.0.\n" + f'Try `pip install --upgrade "transformers>=4.55.0"`\n' + f"to obtain the latest compatible transformers build, then " + f"restart this session." + ) + dispatch_model = FastGptOssModel elif model_type == "gemma2": if not SUPPORTS_GEMMA2: raise ImportError( From 8ff5caee7d75264d511ddeaffd8ba0752f1248e4 Mon Sep 17 00:00:00 2001 From: "C. Rhys Campbell" <149001340+crhysc@users.noreply.github.com> Date: Wed, 19 Nov 2025 15:03:22 -0500 Subject: [PATCH 15/50] patch if load_in_4bit --- atomgpt/inverse_models/loader.py | 25 +++++++++++++++++++------ 1 file changed, 19 insertions(+), 6 deletions(-) diff --git a/atomgpt/inverse_models/loader.py b/atomgpt/inverse_models/loader.py index e473adc..4f28327 100644 --- a/atomgpt/inverse_models/loader.py +++ b/atomgpt/inverse_models/loader.py @@ -918,14 +918,27 @@ def from_pretrained( ] ) pass - + if load_in_4bit: - # Fix up bitsandbytes config + # Fix up bitsandbytes config, robust to missing torch_dtype/dtype. 
+ cfg_dict = model.config.to_dict() + + compute_dtype = cfg_dict.get("torch_dtype", None) + if compute_dtype is None: + # Newer configs may use "dtype" instead + compute_dtype = cfg_dict.get("dtype", None) + + # Fall back to the user-specified dtype or a sensible default + if compute_dtype is None: + compute_dtype = _get_dtype(dtype) # imported above + + # Sometimes this is a string like "float16" – map to torch dtype + import torch + if isinstance(compute_dtype, str): + compute_dtype = getattr(torch, compute_dtype, torch.float16) + quantization_config = { - # Sometimes torch_dtype is not a string!! - "bnb_4bit_compute_dtype": model.config.to_dict()[ - "torch_dtype" - ], + "bnb_4bit_compute_dtype": compute_dtype, "bnb_4bit_quant_type": "nf4", "bnb_4bit_use_double_quant": True, "llm_int8_enable_fp32_cpu_offload": False, From 1cb88ef0acdae0985c42b3d69c18c062c5d5c1c0 Mon Sep 17 00:00:00 2001 From: ccamp104 Date: Wed, 19 Nov 2025 16:42:46 -0500 Subject: [PATCH 16/50] upgrade the required transformers version to 4.57.1 --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 33985ee..7249477 100644 --- a/requirements.txt +++ b/requirements.txt @@ -104,7 +104,7 @@ toolz==1.0.0 torch==2.7.0 torchvision==0.22.0 tqdm==4.67.1 -transformers==4.51.3 +transformers==4.57.1 triton==3.3.0 trl==0.15.2 typeguard==4.4.2 From 69226ee0c0be20e1898cdfb93d0195f70beef6e5 Mon Sep 17 00:00:00 2001 From: "C. Rhys Campbell" <149001340+crhysc@users.noreply.github.com> Date: Wed, 19 Nov 2025 17:14:22 -0500 Subject: [PATCH 17/50] add _get_dtype(dtype) to line 466 --- atomgpt/inverse_models/loader.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/atomgpt/inverse_models/loader.py b/atomgpt/inverse_models/loader.py index 4f28327..0a501ee 100644 --- a/atomgpt/inverse_models/loader.py +++ b/atomgpt/inverse_models/loader.py @@ -461,12 +461,12 @@ def from_pretrained( pass if load_in_4bit: - # Fix up bitsandbytes config + # Fix up bitsandbytes config, robust to missing torch_dtype. + # Use the same helper we use elsewhere. + compute_dtype = _get_dtype(dtype) # falls back to bf16/fp16 based on hardware + quantization_config = { - # Sometimes torch_dtype is not a string!! - "bnb_4bit_compute_dtype": model.config.to_dict()[ - "torch_dtype" - ], + "bnb_4bit_compute_dtype": compute_dtype, "bnb_4bit_quant_type": "nf4", "bnb_4bit_use_double_quant": True, "llm_int8_enable_fp32_cpu_offload": False, @@ -478,6 +478,7 @@ def from_pretrained( "quant_method": "bitsandbytes", } model.config.update({"quantization_config": quantization_config}) + pass if is_peft: From f6efda5cde7a0379c21c41381412bd57cf54c7b6 Mon Sep 17 00:00:00 2001 From: "C. 
Rhys Campbell" <149001340+crhysc@users.noreply.github.com> Date: Wed, 19 Nov 2025 17:36:55 -0500 Subject: [PATCH 18/50] add gpt_oss to def patch_peft_model() --- atomgpt/inverse_models/llama.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/atomgpt/inverse_models/llama.py b/atomgpt/inverse_models/llama.py index f647a75..053f15e 100644 --- a/atomgpt/inverse_models/llama.py +++ b/atomgpt/inverse_models/llama.py @@ -3141,6 +3141,13 @@ def patch_peft_model( apply_lora_mlp = apply_lora_mlp_swiglu elif model_type == "qwen3moe": apply_lora_mlp = apply_lora_mlp_swiglu + elif model_type == "gpt_oss": + if use_gradient_checkpointing == "unsloth": + try: + model.gradient_checkpointing_enable() + except Exception: + pass + return model else: raise NotImplementedError( f"AtomGPT: {model_type} is not yet implemented!" From f1bab0ef9238bd60f8b854abf69d9e392544a567 Mon Sep 17 00:00:00 2001 From: "C. Rhys Campbell" <149001340+crhysc@users.noreply.github.com> Date: Wed, 19 Nov 2025 17:58:53 -0500 Subject: [PATCH 19/50] patch num_logits_to_keep --- atomgpt/inverse_models/llama.py | 27 ++++++++++++++++++++++----- 1 file changed, 22 insertions(+), 5 deletions(-) diff --git a/atomgpt/inverse_models/llama.py b/atomgpt/inverse_models/llama.py index 053f15e..6f93e28 100644 --- a/atomgpt/inverse_models/llama.py +++ b/atomgpt/inverse_models/llama.py @@ -2012,11 +2012,28 @@ def unsloth_fast_generate( # For newer HF kwargs["cache_implementation"] = "dynamic" - # For num_logits_to_keep - num_logits_to_keep = kwargs.get("num_logits_to_keep", None) - logits_to_keep = kwargs.get("logits_to_keep", None) - if num_logits_to_keep is None and logits_to_keep is None: - kwargs["num_logits_to_keep"] = 1 + + # For num_logits_to_keep: only use it if the model forward actually supports it + import inspect + + try: + forward_sig = inspect.signature(self.forward) + supports_num_logits = ( + "num_logits_to_keep" in forward_sig.parameters + or "logits_to_keep" in forward_sig.parameters + ) + except (TypeError, ValueError): + supports_num_logits = False + + if supports_num_logits: + num_logits_to_keep = kwargs.get("num_logits_to_keep", None) + logits_to_keep = kwargs.get("logits_to_keep", None) + if num_logits_to_keep is None and logits_to_keep is None: + kwargs["num_logits_to_keep"] = 1 + else: + # Make sure we don't pass these through to HF generate for models that don't support them + kwargs.pop("num_logits_to_keep", None) + kwargs.pop("logits_to_keep", None) # Remove token_type_ids kwargs.pop("token_type_ids", None) From 7d3d37b2ab39820553f1809883e7bc37fcae6fff Mon Sep 17 00:00:00 2001 From: "C. 
Rhys Campbell" <149001340+crhysc@users.noreply.github.com> Date: Thu, 20 Nov 2025 18:01:50 -0500 Subject: [PATCH 20/50] strip num_logits_to_keep in unsloth_fast_generate() for gpt-oss models --- atomgpt/inverse_models/llama.py | 44 +++++++++++++++++++++------------ 1 file changed, 28 insertions(+), 16 deletions(-) diff --git a/atomgpt/inverse_models/llama.py b/atomgpt/inverse_models/llama.py index 6f93e28..6f29d0a 100644 --- a/atomgpt/inverse_models/llama.py +++ b/atomgpt/inverse_models/llama.py @@ -2013,27 +2013,39 @@ def unsloth_fast_generate( # For newer HF kwargs["cache_implementation"] = "dynamic" - # For num_logits_to_keep: only use it if the model forward actually supports it + # --- Handle num_logits_to_keep / logits_to_keep safely per model type --- import inspect - try: - forward_sig = inspect.signature(self.forward) - supports_num_logits = ( - "num_logits_to_keep" in forward_sig.parameters - or "logits_to_keep" in forward_sig.parameters - ) - except (TypeError, ValueError): - supports_num_logits = False + model_type = getattr(getattr(self, "config", None), "model_type", None) - if supports_num_logits: - num_logits_to_keep = kwargs.get("num_logits_to_keep", None) - logits_to_keep = kwargs.get("logits_to_keep", None) - if num_logits_to_keep is None and logits_to_keep is None: - kwargs["num_logits_to_keep"] = 1 - else: - # Make sure we don't pass these through to HF generate for models that don't support them + # GPT-OSS does *not* advertise these kwargs in its generate/forward stack, and + # passing them causes HF `generate` → `_validate_model_kwargs` to raise: + # ValueError: The following `model_kwargs` are not used by the model: ['num_logits_to_keep'] + # So for GPT-OSS we always strip them. + if model_type == "gpt_oss": kwargs.pop("num_logits_to_keep", None) kwargs.pop("logits_to_keep", None) + else: + # For other models (llama, mistral, etc.), keep Unsloth's optimization: + # only use num_logits_to_keep/logits_to_keep if the model forward supports them. + try: + forward_sig = inspect.signature(self.forward) + supports_num_logits = ( + "num_logits_to_keep" in forward_sig.parameters + or "logits_to_keep" in forward_sig.parameters + ) + except (TypeError, ValueError): + supports_num_logits = False + + if supports_num_logits: + num_logits_to_keep = kwargs.get("num_logits_to_keep", None) + logits_to_keep = kwargs.get("logits_to_keep", None) + if num_logits_to_keep is None and logits_to_keep is None: + # Enable Unsloth's memory optimization for compatible models + kwargs["num_logits_to_keep"] = 1 + else: + kwargs.pop("num_logits_to_keep", None) + kwargs.pop("logits_to_keep", None) # Remove token_type_ids kwargs.pop("token_type_ids", None) From c02d6a5f66d527e7b5dc7c1edab050a31a76ecdb Mon Sep 17 00:00:00 2001 From: "C. Rhys Campbell" <149001340+crhysc@users.noreply.github.com> Date: Thu, 20 Nov 2025 18:50:45 -0500 Subject: [PATCH 21/50] Update gpt_oss.py --- atomgpt/inverse_models/gpt_oss.py | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/atomgpt/inverse_models/gpt_oss.py b/atomgpt/inverse_models/gpt_oss.py index dc6a395..67e29a0 100644 --- a/atomgpt/inverse_models/gpt_oss.py +++ b/atomgpt/inverse_models/gpt_oss.py @@ -18,6 +18,32 @@ "and ensure you are on a release that supports GPT-OSS." ) from exc +# --- AtomGPT: fix GPT-OSS position_ids shape for rotary embeddings --- +# Some fast-generation paths may end up passing a 1D tensor for `position_ids` +# (shape [seq_len]), but GPT-OSS's rotary embeddings expect [batch, seq_len]. 
+# This wrapper upgrades 1D position_ids → [1, seq_len] to avoid IndexError. + +if not hasattr(GptOssModel, "_atomgpt_position_ids_patched"): + _original_gpt_oss_forward = GptOssModel.forward + + def _atomgpt_gpt_oss_forward(self, *args, **kwargs): + pos = kwargs.get("position_ids", None) + try: + if pos is not None and hasattr(pos, "dim") and pos.dim() == 1: + # [seq_len] -> [1, seq_len] + kwargs["position_ids"] = pos.unsqueeze(0) + except Exception: + # Best-effort: never let our fix be the thing that breaks. + pass + return _original_gpt_oss_forward(self, *args, **kwargs) + + GptOssModel.forward = _atomgpt_gpt_oss_forward + GptOssModel._atomgpt_position_ids_patched = True + + print( + "AtomGPT: Patched GptOssModel.forward to fix 1D position_ids for GPT-OSS rotary embeddings." + ) + # Convenience list of all 4 Unsloth GPT-OSS models that are supported via # FastLanguageModel.from_pretrained(..., model_name=...). From 078175d4347d2f60e61c248a734fef55db49ef2b Mon Sep 17 00:00:00 2001 From: "C. Rhys Campbell" <149001340+crhysc@users.noreply.github.com> Date: Thu, 20 Nov 2025 18:57:32 -0500 Subject: [PATCH 22/50] patch pre_patch() --- atomgpt/inverse_models/gpt_oss.py | 23 +++++++++++++++-------- 1 file changed, 15 insertions(+), 8 deletions(-) diff --git a/atomgpt/inverse_models/gpt_oss.py b/atomgpt/inverse_models/gpt_oss.py index 67e29a0..3355093 100644 --- a/atomgpt/inverse_models/gpt_oss.py +++ b/atomgpt/inverse_models/gpt_oss.py @@ -86,8 +86,9 @@ class FastGptOssModel(FastLlamaModel): * We **do**: - patch PEFT `PeftModelForCausalLM.forward` to the same fast path that LLaMA / Mistral use. - - patch `prepare_inputs_for_generation` for `GptOssForCausalLM`, so - that your generation path stays consistent with LLaMA / Mistral. + - (for now) leave `GptOssForCausalLM.prepare_inputs_for_generation` + untouched, because the LLaMA-style patch breaks GPT-OSS attention + shapes during sampling. * Everything else is delegated to `FastLlamaModel.from_pretrained` with `model_patcher=FastGptOssModel`, to keep the hierarchy uniform. """ @@ -106,17 +107,23 @@ def pre_patch(): global PeftModelForCausalLM # imported from llama.py via * PeftModelForCausalLM.forward = PeftModelForCausalLM_fast_forward - # Reuse the generation input patcher so GPT-OSS works with your - # custom `prepare_inputs_for_generation` handling (past-key-values, - # sliding window, etc.), analogous to Mistral / LLaMA. - fix_prepare_inputs_for_generation(GptOssForCausalLM) + # IMPORTANT: + # Do NOT call `fix_prepare_inputs_for_generation(GptOssForCausalLM)` + # here. That patch is tailored to LLaMA/Mistral KV-cache semantics and + # causes attention shape mismatches for GPT-OSS (e.g. value_states + # ending up with seq_len = 1 instead of the full context length). + # + # We'll rely on the official transformers implementation of + # `prepare_inputs_for_generation` for GPT-OSS instead. + # fix_prepare_inputs_for_generation(GptOssForCausalLM) _log_once( - "AtomGPT: Patched GPT-OSS (GptOssForCausalLM + PEFT) for " - "FastLanguageModel integration." + "AtomGPT: Patched GPT-OSS (PEFT fast forward only; " + "using native prepare_inputs_for_generation)." ) return + @staticmethod def from_pretrained( model_name: str = "unsloth/gpt-oss-20b-unsloth-bnb-4bit", From 1be754cfda497192d42d0889f340d0c2a87cd5e3 Mon Sep 17 00:00:00 2001 From: "C. 
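Patches 21 and 22 both lean on guarded monkey-patching. The load-bearing details are the saved original, the sentinel attribute that makes re-import a no-op, and deferring to the original after normalizing inputs. A distilled sketch on a hypothetical Target class (not a transformers type):

class Target:
    def forward(self, x):
        return x

if not hasattr(Target, "_patched"):        # sentinel: repeated imports are no-ops
    _original_forward = Target.forward     # keep a handle to the unpatched method

    def _patched_forward(self, x):
        x = max(x, 0)                      # normalize the argument, like 1D -> 2D above
        return _original_forward(self, x)  # then defer to the original implementation

    Target.forward = _patched_forward
    Target._patched = True

assert Target().forward(-3) == 0

Without the sentinel, a second import would wrap the wrapper, and _original_forward would point at the first wrapper rather than the true original.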
Rhys Campbell" <149001340+crhysc@users.noreply.github.com> Date: Thu, 20 Nov 2025 19:19:35 -0500 Subject: [PATCH 23/50] force progress bar --- atomgpt/inverse_models/inverse_models.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/atomgpt/inverse_models/inverse_models.py b/atomgpt/inverse_models/inverse_models.py index 9744286..753c303 100644 --- a/atomgpt/inverse_models/inverse_models.py +++ b/atomgpt/inverse_models/inverse_models.py @@ -593,7 +593,9 @@ def tokenize_function(example): num_train_epochs=config.num_epochs, save_strategy=config.save_strategy, save_steps=config.save_steps, - ), + disable_tqdm=False, + log_level="info", + ), ) if callback_samples > 0: callback = ExampleTrainerCallback( From 85d2f82768b12c3105c74876ec18f5e76f70c581 Mon Sep 17 00:00:00 2001 From: crhysc Date: Thu, 2 Oct 2025 16:24:41 -0400 Subject: [PATCH 24/50] add invalid structures error handling --- atomgpt/inverse_models/inverse_models.py | 117 +++++++++++++++++------ 1 file changed, 86 insertions(+), 31 deletions(-) diff --git a/atomgpt/inverse_models/inverse_models.py b/atomgpt/inverse_models/inverse_models.py index 753c303..b4cc788 100644 --- a/atomgpt/inverse_models/inverse_models.py +++ b/atomgpt/inverse_models/inverse_models.py @@ -27,6 +27,7 @@ from jarvis.io.vasp.inputs import Poscar import csv import os +import numpy as np from pydantic_settings import BaseSettings import sys import json @@ -211,42 +212,96 @@ def load_model(path="", config=None): FastLanguageModel.for_inference(model) return model, tokenizer, config +def _validate_atoms(atoms): + if atoms is None: + return False, "atoms_is_none" + try: + lat = np.asarray(getattr(atoms, "lattice_mat", None), dtype=float) + if lat.shape != (3, 3): + return False, f"bad_lattice_shape:{getattr(atoms,'lattice_mat',None)}" + if not np.isfinite(lat).all(): + return False, "nonfinite_lattice" + n = getattr(atoms, "num_atoms", None) + if n is None or n <= 0: + return False, f"num_atoms_invalid:{n}" + _ = Poscar(atoms).to_string() + return True, "" + except Exception as e: + return False, f"poscar_fail:{type(e).__name__}:{e}" + +def _poscar_one_line(at): + return Poscar(at).to_string().replace("\n", "\\n") + +def _misses_path(csv_out, config): + fname = getattr(config, "miss_csv", None) + if fname is None or not str(fname).strip(): + root, ext = os.path.splitext(csv_out) + fname = root + ".misses.csv" + os.makedirs(os.path.dirname(os.path.abspath(fname)), exist_ok=True) + return fname def evaluate( - test_set=[], model="", tokenizer="", csv_out="out.csv", config="" + test_set=[], + model="", + tokenizer="", + csv_out="out.csv", + config="", ): print("Testing\n", len(test_set)) - f = open(csv_out, "w") - f.write("id,target,prediction\n") + os.makedirs(os.path.dirname(os.path.abspath(csv_out)), exist_ok=True) + miss_csv_out = _misses_path(csv_out, config) + + with open(csv_out, "w", newline="") as f_ok, open(miss_csv_out, "w", newline="") as f_miss: + ok_writer = csv.writer(f_ok) + miss_writer = csv.writer(f_miss) + ok_writer.writerow(["id", "target", "prediction"]) + miss_writer.writerow(["id", "stage", "error", "detail", "raw_text_preview"]) + + for i in tqdm(test_set, total=len(test_set)): + sample_id = i.get("id", "") + target_mat = None + target_err = None + try: + target_mat = text2atoms("\n" + i["output"]) + ok, detail = _validate_atoms(target_mat) + if not ok: + target_err = detail + except Exception as e: + target_err = f"text2atoms:{type(e).__name__}:{e}" + + if target_err: + miss_writer.writerow([sample_id, "target", 
"invalid_target", target_err, (i.get("output","")[:240])]) + continue + + gen_mat = None + gen_err = None + try: + gen_mat = gen_atoms( + prompt=i["input"], + tokenizer=tokenizer, + model=model, + alpaca_prompt=config.alpaca_prompt, + instruction=config.instruction, + ) + ok, detail = _validate_atoms(gen_mat) + if not ok: + gen_err = detail + except Exception as e: + gen_err = f"gen_atoms:{type(e).__name__}:{e}" + + if gen_err: + miss_writer.writerow([sample_id, "prediction", "invalid_prediction", gen_err, ""]) + continue + + try: + ok_writer.writerow([ + sample_id, + _poscar_one_line(target_mat), + _poscar_one_line(gen_mat), + ]) + except Exception as e: + miss_writer.writerow([sample_id, "write", "write_failed", f"{type(e).__name__}:{e}", ""]) - for i in tqdm(test_set, total=len(test_set)): - # try: - # prompt = i["input"] - # print("prompt", prompt) - gen_mat = gen_atoms( - prompt=i["input"], - tokenizer=tokenizer, - model=model, - alpaca_prompt=config.alpaca_prompt, - instruction=config.instruction, - ) - target_mat = text2atoms("\n" + i["output"]) - print("target_mat", target_mat) - print("genmat", gen_mat) - line = ( - i["id"] - + "," - + Poscar(target_mat).to_string().replace("\n", "\\n") - + "," - + Poscar(gen_mat).to_string().replace("\n", "\\n") - + "\n" - ) - f.write(line) - # print() - # except Exception as exp: - # print("Error", exp) - # pass - f.close() def batch_evaluate( From 9552f4ea25f9276744c5676338547b94f5c46b5c Mon Sep 17 00:00:00 2001 From: "C. Rhys Campbell" <149001340+crhysc@users.noreply.github.com> Date: Tue, 18 Nov 2025 10:40:57 -0500 Subject: [PATCH 25/50] add error handling for inverse_predict.py --- atomgpt/inverse_models/inverse_predict.py | 111 ++++++++++++++-------- 1 file changed, 71 insertions(+), 40 deletions(-) diff --git a/atomgpt/inverse_models/inverse_predict.py b/atomgpt/inverse_models/inverse_predict.py index 378ca35..003cc08 100644 --- a/atomgpt/inverse_models/inverse_predict.py +++ b/atomgpt/inverse_models/inverse_predict.py @@ -111,12 +111,9 @@ def predict( prop_val=None, dtype=None, max_seq_length=1058, - load_in_4bit=None, # temp_config["load_in_4bit"] - verbose=True, # temp_config["load_in_4bit"] + load_in_4bit=None, + verbose=True, ): - # if not os.path.exists("config_name"): - - # config_name=os.path.join(output_dir,"config.json") print("config_path", config_path) if output_dir is not None: config_name = os.path.join(output_dir, "config.json") @@ -125,7 +122,7 @@ def predict( config_name = os.path.join(parent, "config.json") adapter = os.path.join(output_dir, "adapter_config.json") if os.path.exists(adapter): - model_name = output_dir # temp_config["model_name"] + model_name = output_dir if config_path is not None: config_name = config_path if verbose: @@ -142,7 +139,6 @@ def predict( pprint.pprint(temp_config) if model_name is None: model_name = temp_config["model_name"] - # output_dir = temp_config["output_dir"] if load_in_4bit is None: load_in_4bit = temp_config["load_in_4bit"] @@ -150,6 +146,7 @@ def predict( print("Model used:", model_name) print("config used:", config_path) print("formula:", formula) + model = None tokenizer = None try: @@ -161,29 +158,28 @@ def predict( device_map="auto", ) FastLanguageModel.for_inference(model) - except: + except Exception: tokenizer = AutoTokenizer.from_pretrained( model_name, gguf_file=filename ) model = AutoModelForCausalLM.from_pretrained( model_name, gguf_file=filename ) - pass + atoms_arr = [] lines = [] if formula is None: - # if dat_path is None: - f = open(pred_csv, "r") - lines = 
From 9552f4ea25f9276744c5676338547b94f5c46b5c Mon Sep 17 00:00:00 2001
From: "C. Rhys Campbell" <149001340+crhysc@users.noreply.github.com>
Date: Tue, 18 Nov 2025 10:40:57 -0500
Subject: [PATCH 25/50] add error handling for inverse_predict.py

---
 atomgpt/inverse_models/inverse_predict.py | 111 ++++++++++++++--------
 1 file changed, 71 insertions(+), 40 deletions(-)

diff --git a/atomgpt/inverse_models/inverse_predict.py b/atomgpt/inverse_models/inverse_predict.py
index 378ca35..003cc08 100644
--- a/atomgpt/inverse_models/inverse_predict.py
+++ b/atomgpt/inverse_models/inverse_predict.py
@@ -111,12 +111,9 @@ def predict(
     prop_val=None,
     dtype=None,
     max_seq_length=1058,
-    load_in_4bit=None,  # temp_config["load_in_4bit"]
-    verbose=True,  # temp_config["load_in_4bit"]
+    load_in_4bit=None,
+    verbose=True,
 ):
-    # if not os.path.exists("config_name"):
-
-    # config_name=os.path.join(output_dir,"config.json")
     print("config_path", config_path)
     if output_dir is not None:
         config_name = os.path.join(output_dir, "config.json")
@@ -125,7 +122,7 @@ def predict(
             config_name = os.path.join(parent, "config.json")
         adapter = os.path.join(output_dir, "adapter_config.json")
         if os.path.exists(adapter):
-            model_name = output_dir  # temp_config["model_name"]
+            model_name = output_dir
     if config_path is not None:
         config_name = config_path
     if verbose:
@@ -142,7 +139,6 @@ def predict(
         pprint.pprint(temp_config)
     if model_name is None:
         model_name = temp_config["model_name"]
-        # output_dir = temp_config["output_dir"]
     if load_in_4bit is None:
         load_in_4bit = temp_config["load_in_4bit"]
 
@@ -150,6 +146,7 @@ def predict(
     print("Model used:", model_name)
     print("config used:", config_path)
     print("formula:", formula)
+
     model = None
     tokenizer = None
     try:
@@ -161,29 +158,28 @@ def predict(
             device_map="auto",
         )
         FastLanguageModel.for_inference(model)
-    except:
+    except Exception:
         tokenizer = AutoTokenizer.from_pretrained(
             model_name, gguf_file=filename
         )
         model = AutoModelForCausalLM.from_pretrained(
             model_name, gguf_file=filename
         )
-        pass
+
     atoms_arr = []
     lines = []
     if formula is None:
-        # if dat_path is None:
-        f = open(pred_csv, "r")
-        lines = f.read().splitlines()
-        f.close()
+        with open(pred_csv, "r") as f:
+            lines = f.read().splitlines()
     else:
         if dat_path is not None:
             lines = [dat_path]
-        lines = [formula]
+        else:
+            lines = [formula]
 
     mem = []
-    for i in lines:
+    for idx, i in enumerate(lines):
         prompt = i
         if ".dat" in i or dat_path is not None:
             if dat_path is None:
@@ -198,21 +194,18 @@ def predict(
                 formula=formula,
                 background_subs=background_subs,
             )
-            # y[y < 0.1] = 0
-            y_new_str = y  # "\n".join(["{0:.2f}".format(x) for x in y])
+            y_new_str = y
             try:
                 if ".dat" in i:
                     formula = str(_formula.split("/")[-1].split(".dat")[0])
             except Exception:
                 pass
-            # gen_mat = main_spectra(spectra=[[y_new_str,y]],formulas=[formula],model=model,tokenizer=tokenizer,device='cuda')[0]
             prompt = (
                 "The chemical formula is "
                 + formula
                 + " The "
                 + temp_config["prop"]
                 + " is "
-                # + " The XRD is "
                 + y_new_str
                 + ". Generate atomic structure description with lattice lengths, angles, coordinates and atom types."
             )
@@ -224,34 +217,72 @@ def predict(
                 + " The "
                 + temp_config["prop"]
                 + " is "
-                # + " The XRD is "
                 + str(prop_val)
                 + ". Generate atomic structure description with lattice lengths, angles, coordinates and atom types."
             )
 
         if verbose:
-            print("prompt here", prompt.replace("\n", ","))
-        gen_mat = gen_atoms(
-            prompt=prompt,
-            model=model,
-            tokenizer=tokenizer,
-            alpaca_prompt=temp_config["alpaca_prompt"],
-            instruction=temp_config["instruction"],
-            device=device,
-        )
-        if verbose:
-            print("gen atoms", gen_mat)
-            print("gen atoms spacegroup", gen_mat.spacegroup())
-            print("intvl", intvl)
-        if relax:
-            gen_mat = relax_atoms(atoms=gen_mat)
+            print(f"[{idx}] prompt:", prompt.replace("\n", ","))
+
+        info = {"prompt": prompt}
+        gen_mat = None
+
+        # --- NEW: robust error handling around generation / structure use ---
+        try:
+            gen_mat = gen_atoms(
+                prompt=prompt,
+                model=model,
+                tokenizer=tokenizer,
+                alpaca_prompt=temp_config["alpaca_prompt"],
+                instruction=temp_config["instruction"],
+                device=device,
+            )
+
             if verbose:
-                print("gen atoms relax", gen_mat, gen_mat.spacegroup())
-        atoms_arr.append(gen_mat.to_dict())
-        info = {}
-        info["prompt"] = prompt
-        info["atoms"] = gen_mat.to_dict()
+                print(f"[{idx}] gen atoms:", gen_mat)
+                # spacegroup() can fail for broken structures, so guard it
+                try:
+                    print(f"[{idx}] gen atoms spacegroup:", gen_mat.spacegroup())
+                except Exception as e_sg:
+                    print(
+                        f"[WARN] Failed to compute spacegroup for sample {idx}: {e_sg}"
+                    )
+
+            if relax:
+                try:
+                    gen_mat = relax_atoms(atoms=gen_mat)
+                    if verbose:
+                        print(
+                            f"[{idx}] gen atoms relax:",
+                            gen_mat,
+                            gen_mat.spacegroup(),
+                        )
+                except Exception as e_relax:
+                    print(
+                        f"[WARN] Relaxation failed for sample {idx}, "
+                        "continuing with unrelaxed structure."
+                    )
+                    print(traceback.format_exc())
+
+            # this is another common crash point if gen_mat is invalid
+            atoms_dict = gen_mat.to_dict()
+            atoms_arr.append(atoms_dict)
+            info["atoms"] = atoms_dict
+
+        except Exception as e:
+            print(
+                f"[ERROR] Failed to generate a valid structure for sample {idx} "
+                f"(input: {i}): {e}"
+            )
+            # optional: print full traceback for debugging
+            print(traceback.format_exc())
+            info["error"] = str(e)
+            # do NOT re-raise; just skip this structure and move on
+            mem.append(info)
+            continue
+
         mem.append(info)
+
     dumpjson(data=mem, filename=fname)
     return model, tokenizer, temp_config

From 8902e7099deae5e17f480c2ad546052eac2c3eae Mon Sep 17 00:00:00 2001
From: Charles Campbell
Date: Wed, 5 Nov 2025 12:50:42 -0500
Subject: [PATCH 26/50] num_proc

---
 atomgpt/inverse_models/inverse_models.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/atomgpt/inverse_models/inverse_models.py b/atomgpt/inverse_models/inverse_models.py
index b4cc788..b9e337c 100644
--- a/atomgpt/inverse_models/inverse_models.py
+++ b/atomgpt/inverse_models/inverse_models.py
@@ -569,10 +569,12 @@ def tokenize_function(example):
     train_dataset = train_dataset.map(
         formatting_prompts_func_with_prompt,
         batched=True,
+        num_proc=config.dataset_num_proc
     )
     eval_dataset = eval_dataset.map(
         formatting_prompts_func_with_prompt,
         batched=True,
+        num_proc=config.dataset_num_proc
     )
     # Compute the actual max sequence length in raw text
     lengths = [
@@ -582,8 +584,8 @@ def tokenize_function(example):
     max_seq_length = max(lengths)
     print(f"🧠 Suggested max_seq_length based on dataset: {max_seq_length}")
 
-    tokenized_train = train_dataset.map(tokenize_function, batched=True)
-    tokenized_eval = eval_dataset.map(tokenize_function, batched=True)
+    tokenized_train = train_dataset.map(tokenize_function, batched=True, num_proc=config.dataset_num_proc)
+    tokenized_eval = eval_dataset.map(tokenize_function, batched=True, num_proc=config.dataset_num_proc)
     tokenized_train.set_format(
         type="torch", columns=["input_ids", "attention_mask", "output"]
     )

From 431e3f91509619c4154099244c7e85232b47c0d8 Mon Sep 17 00:00:00 2001
From: "C. Rhys Campbell" <149001340+crhysc@users.noreply.github.com>
Date: Tue, 18 Nov 2025 12:36:10 -0500
Subject: [PATCH 27/50] if gen_mat is None: ...
---
 atomgpt/inverse_models/inverse_predict.py | 108 ++++++++++------------
 1 file changed, 39 insertions(+), 69 deletions(-)

diff --git a/atomgpt/inverse_models/inverse_predict.py b/atomgpt/inverse_models/inverse_predict.py
index 003cc08..9e36a57 100644
--- a/atomgpt/inverse_models/inverse_predict.py
+++ b/atomgpt/inverse_models/inverse_predict.py
@@ -79,13 +79,10 @@ def relax_atoms(
         calculator = AlignnAtomwiseCalculator(path=default_path(), device="cpu")
 
     t1 = time.time()
-    # if calculator is None:
-    #    return atoms
     ase_atoms = atoms.ase_converter()
     ase_atoms.calc = calculator
     ase_atoms = ExpCellFilter(ase_atoms, constant_volume=constant_volume)
 
-    # TODO: Make it work with any other optimizer
     dyn = FIRE(ase_atoms)
     dyn.run(fmax=fmax, steps=nsteps)
     en = ase_atoms.atoms.get_potential_energy()
@@ -111,8 +108,8 @@ def predict(
     prop_val=None,
     dtype=None,
     max_seq_length=1058,
-    load_in_4bit=None,
-    verbose=True,
+    load_in_4bit=None,  # temp_config["load_in_4bit"]
+    verbose=True,  # temp_config["load_in_4bit"]
 ):
     print("config_path", config_path)
     if output_dir is not None:
@@ -139,6 +136,7 @@ def predict(
         pprint.pprint(temp_config)
     if model_name is None:
         model_name = temp_config["model_name"]
+        # output_dir = temp_config["output_dir"]
     if load_in_4bit is None:
         load_in_4bit = temp_config["load_in_4bit"]
 
@@ -146,7 +144,6 @@ def predict(
     print("Model used:", model_name)
     print("config used:", config_path)
     print("formula:", formula)
-
     model = None
     tokenizer = None
     try:
@@ -158,28 +155,29 @@ def predict(
             device_map="auto",
         )
         FastLanguageModel.for_inference(model)
-    except Exception:
+    except:
         tokenizer = AutoTokenizer.from_pretrained(
             model_name, gguf_file=filename
         )
         model = AutoModelForCausalLM.from_pretrained(
             model_name, gguf_file=filename
         )
-
+        pass
     atoms_arr = []
     lines = []
     if formula is None:
-        with open(pred_csv, "r") as f:
-            lines = f.read().splitlines()
+        # if dat_path is None:
+        f = open(pred_csv, "r")
+        lines = f.read().splitlines()
+        f.close()
     else:
         if dat_path is not None:
            lines = [dat_path]
-        else:
-            lines = [formula]
+        lines = [formula]
 
     mem = []
-    for idx, i in enumerate(lines):
+    for i in lines:
         prompt = i
         if ".dat" in i or dat_path is not None:
             if dat_path is None:
@@ -222,74 +220,47 @@ def predict(
             )
 
         if verbose:
-            print(f"[{idx}] prompt:", prompt.replace("\n", ","))
-
-        info = {"prompt": prompt}
-        gen_mat = None
+            print("prompt here", prompt.replace("\n", ","))
 
-        # --- NEW: robust error handling around generation / structure use ---
-        try:
-            gen_mat = gen_atoms(
-                prompt=prompt,
-                model=model,
-                tokenizer=tokenizer,
-                alpaca_prompt=temp_config["alpaca_prompt"],
-                instruction=temp_config["instruction"],
-                device=device,
-            )
-
-            if verbose:
-                print(f"[{idx}] gen atoms:", gen_mat)
-                # spacegroup() can fail for broken structures, so guard it
-                try:
-                    print(f"[{idx}] gen atoms spacegroup:", gen_mat.spacegroup())
-                except Exception as e_sg:
-                    print(
-                        f"[WARN] Failed to compute spacegroup for sample {idx}: {e_sg}"
-                    )
-
-            if relax:
-                try:
-                    gen_mat = relax_atoms(atoms=gen_mat)
-                    if verbose:
-                        print(
-                            f"[{idx}] gen atoms relax:",
-                            gen_mat,
-                            gen_mat.spacegroup(),
-                        )
-                except Exception as e_relax:
-                    print(
-                        f"[WARN] Relaxation failed for sample {idx}, "
-                        "continuing with unrelaxed structure."
-                    )
-                    print(traceback.format_exc())
-
-            # this is another common crash point if gen_mat is invalid
-            atoms_dict = gen_mat.to_dict()
-            atoms_arr.append(atoms_dict)
-            info["atoms"] = atoms_dict
+        gen_mat = gen_atoms(
+            prompt=prompt,
+            model=model,
+            tokenizer=tokenizer,
+            alpaca_prompt=temp_config["alpaca_prompt"],
+            instruction=temp_config["instruction"],
+            device=device,
+        )
 
-        except Exception as e:
+        if gen_mat is None:
             print(
-                f"[ERROR] Failed to generate a valid structure for sample {idx} "
-                f"(input: {i}): {e}"
+                "The returned structure is invalid. Here is the output:",
+                gen_mat,
             )
-            # optional: print full traceback for debugging
-            print(traceback.format_exc())
-            info["error"] = str(e)
-            # do NOT re-raise; just skip this structure and move on
+            info = {}
+            info["prompt"] = prompt
+            info["error"] = "Invalid structure returned by AtomGPT (None)."
             mem.append(info)
+            # skip the rest of the loop for this entry
             continue
 
+        if verbose:
+            print("gen atoms", gen_mat)
+            print("gen atoms spacegroup", gen_mat.spacegroup())
+            print("intvl", intvl)
+        if relax:
+            gen_mat = relax_atoms(atoms=gen_mat)
+            if verbose:
+                print("gen atoms relax", gen_mat, gen_mat.spacegroup())
+        atoms_arr.append(gen_mat.to_dict())
+        info = {}
+        info["prompt"] = prompt
+        info["atoms"] = gen_mat.to_dict()
         mem.append(info)
-
     dumpjson(data=mem, filename=fname)
     return model, tokenizer, temp_config
 
 
 if __name__ == "__main__":
-    # output_dir = make_id_prop()
-    # output_dir="."
     args = parser.parse_args(sys.argv[1:])
     print("args.config_path", args.config_path)
     predict(
@@ -302,5 +273,4 @@ if __name__ == "__main__":
         config_path=args.config_path,
         prop_val=args.prop_val,
         background_subs=args.background_subs,
-        # config_name=args.config_name,
     )

From 3bbd9d2293d13bdafb95831a2e4482a8c669de62 Mon Sep 17 00:00:00 2001
From: "C. Rhys Campbell" <149001340+crhysc@users.noreply.github.com>
Date: Tue, 18 Nov 2025 12:42:01 -0500
Subject: [PATCH 28/50] rm "Here is the output"

---
 atomgpt/inverse_models/inverse_predict.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/atomgpt/inverse_models/inverse_predict.py b/atomgpt/inverse_models/inverse_predict.py
index 9e36a57..109e3f9 100644
--- a/atomgpt/inverse_models/inverse_predict.py
+++ b/atomgpt/inverse_models/inverse_predict.py
@@ -233,8 +233,7 @@ def predict(
 
         if gen_mat is None:
             print(
-                "The returned structure is invalid. Here is the output:",
-                gen_mat,
+                "The structure returned by gen_mat() is not a valid crystal structure.
             )
             info = {}
             info["prompt"] = prompt

From e3d22e18eb0b773bf785cea0c86efdf3f96834ae Mon Sep 17 00:00:00 2001
From: "C. Rhys Campbell" <149001340+crhysc@users.noreply.github.com>
Date: Tue, 18 Nov 2025 12:43:10 -0500
Subject: [PATCH 29/50] terminate string literal

---
 atomgpt/inverse_models/inverse_predict.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/atomgpt/inverse_models/inverse_predict.py b/atomgpt/inverse_models/inverse_predict.py
index 109e3f9..63054cf 100644
--- a/atomgpt/inverse_models/inverse_predict.py
+++ b/atomgpt/inverse_models/inverse_predict.py
@@ -233,7 +233,7 @@ def predict(
 
         if gen_mat is None:
             print(
-                "The structure returned by gen_mat() is not a valid crystal structure.
+                "The structure returned by gen_mat() is not a valid crystal structure."
             )
             info = {}
             info["prompt"] = prompt

From d9c94afa78f476875af154734e8deef91998d041 Mon Sep 17 00:00:00 2001
From: ccamp104
Date: Mon, 1 Dec 2025 12:03:40 -0500
Subject: [PATCH 30/50] let tokenizers be >= 0.22.0

---
 requirements.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/requirements.txt b/requirements.txt
index 7249477..40ae609 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -98,7 +98,7 @@ SQLAlchemy==2.0.43
 sympy==1.14.0
 threadpoolctl==3.6.0
 tifffile==2025.5.10
-tokenizers==0.21.1
+tokenizers>=0.22.0
 tomli==2.2.1
 toolz==1.0.0
 torch==2.7.0

From 5b4ef60f4ff3359f2820b92946118db752491aaf Mon Sep 17 00:00:00 2001
From: ccamp104
Date: Mon, 1 Dec 2025 12:06:35 -0500
Subject: [PATCH 31/50] hf hub >= 0.32.0

---
 requirements.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/requirements.txt b/requirements.txt
index 40ae609..aa7cd60 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -28,7 +28,7 @@ gguf==0.16.3
 greenlet==3.2.4
 hf-xet==1.1.2
 hf_transfer==0.1.9
-huggingface-hub==0.32.0
+huggingface-hub>=0.32.0
 idna==3.10
 imageio==2.37.0
 importlib_metadata==8.7.0

From 32e831d5382ebc11d31c5d801a7ead42246ecfac Mon Sep 17 00:00:00 2001
From: ccamp104
Date: Mon, 1 Dec 2025 12:10:42 -0500
Subject: [PATCH 32/50] hf-xet>=1.1.2

---
 requirements.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/requirements.txt b/requirements.txt
index aa7cd60..61e4f7f 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -26,7 +26,7 @@ frozenlist==1.6.0
 fsspec==2025.3.0
 gguf==0.16.3
 greenlet==3.2.4
-hf-xet==1.1.2
+hf-xet>=1.1.2
 hf_transfer==0.1.9
 huggingface-hub>=0.32.0
 idna==3.10

From 006cf16be47170388d5aa45e1fcbec5889a1db7d Mon Sep 17 00:00:00 2001
From: ccamp104
Date: Thu, 4 Dec 2025 12:26:32 -0500
Subject: [PATCH 33/50] print target and predicted structures if
 PRINT_STRUCTURES=1

---
 atomgpt/inverse_models/inverse_models.py | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/atomgpt/inverse_models/inverse_models.py b/atomgpt/inverse_models/inverse_models.py
index b9e337c..a672d2f 100644
--- a/atomgpt/inverse_models/inverse_models.py
+++ b/atomgpt/inverse_models/inverse_models.py
@@ -293,6 +293,12 @@ def evaluate(
                 miss_writer.writerow([sample_id, "prediction", "invalid_prediction", gen_err, ""])
                 continue
 
+            if os.environ.get("PRINT_STRUCTURES"):
+                print("Target Structure:")
+                print(target_mat)
+                print("Predicted Structure:")
+                print(gen_mat)
+
             try:
                 ok_writer.writerow([

From 786ddacae193f07e87c7475caf5cc39525d50b30 Mon Sep 17 00:00:00 2001
From: ccamp104
Date: Thu, 4 Dec 2025 12:36:21 -0500
Subject: [PATCH 34/50] mv print statements before validation checks

---
 atomgpt/inverse_models/inverse_models.py | 18 ++++++++++++------
 1 file changed, 12 insertions(+), 6 deletions(-)

diff --git a/atomgpt/inverse_models/inverse_models.py b/atomgpt/inverse_models/inverse_models.py
index a672d2f..80f9e69 100644
--- a/atomgpt/inverse_models/inverse_models.py
+++ b/atomgpt/inverse_models/inverse_models.py
@@ -263,11 +263,17 @@ def evaluate(
             target_err = None
             try:
                 target_mat = text2atoms("\n" + i["output"])
+                if os.environ.get("PRINT_STRUCTURES"):
+                    print(f"Target Structure ({sample_id}):")
+                    print(target_mat)
+
                 ok, detail = _validate_atoms(target_mat)
                 if not ok:
                     target_err = detail
             except Exception as e:
                 target_err = f"text2atoms:{type(e).__name__}:{e}"
+                if os.environ.get("PRINT_STRUCTURES"):
+                    print(f"Target Structure ({sample_id}) FAILED: {target_err}")
 
             if target_err:
                 miss_writer.writerow([sample_id, "target", "invalid_target", target_err, (i.get("output","")[:240])])
@@ -283,22 +289,22 @@ def evaluate(
                     alpaca_prompt=config.alpaca_prompt,
                     instruction=config.instruction,
                 )
+                if os.environ.get("PRINT_STRUCTURES"):
+                    print(f"Predicted Structure ({sample_id}):")
+                    print(gen_mat)
+
                 ok, detail = _validate_atoms(gen_mat)
                 if not ok:
                     gen_err = detail
             except Exception as e:
                 gen_err = f"gen_atoms:{type(e).__name__}:{e}"
+                if os.environ.get("PRINT_STRUCTURES"):
+                    print(f"Predicted Structure ({sample_id}) FAILED: {gen_err}")
 
             if gen_err:
                 miss_writer.writerow([sample_id, "prediction", "invalid_prediction", gen_err, ""])
                 continue
 
-            if os.environ.get("PRINT_STRUCTURES"):
-                print("Target Structure:")
-                print(target_mat)
-                print("Predicted Structure:")
-                print(gen_mat)
-
             try:
                 ok_writer.writerow([

From 76602a662e062e1c4a996b39c9f1dc09dffcd6e9 Mon Sep 17 00:00:00 2001
From: ccamp104
Date: Thu, 4 Dec 2025 12:47:30 -0500
Subject: [PATCH 35/50] let the raw LLM output be printed if PRINT_STRUCTURES=1

---
 atomgpt/inverse_models/inverse_models.py | 5 ++++-
 atomgpt/inverse_models/utils.py          | 4 ++--
 2 files changed, 6 insertions(+), 3 deletions(-)

diff --git a/atomgpt/inverse_models/inverse_models.py b/atomgpt/inverse_models/inverse_models.py
index 80f9e69..a24e998 100644
--- a/atomgpt/inverse_models/inverse_models.py
+++ b/atomgpt/inverse_models/inverse_models.py
@@ -281,8 +281,9 @@ def evaluate(
 
             gen_mat = None
             gen_err = None
+            raw_response = ""
             try:
-                gen_mat = gen_atoms(
+                gen_mat, raw_response = gen_atoms(
                     prompt=i["input"],
                     tokenizer=tokenizer,
                     model=model,
@@ -300,6 +301,8 @@ def evaluate(
                 gen_err = f"gen_atoms:{type(e).__name__}:{e}"
                 if os.environ.get("PRINT_STRUCTURES"):
                     print(f"Predicted Structure ({sample_id}) FAILED: {gen_err}")
+                    print(f"Raw LLM Output ({sample_id}):")
+                    print(raw_response)
 
             if gen_err:
                 miss_writer.writerow([sample_id, "prediction", "invalid_prediction", gen_err, ""])
diff --git a/atomgpt/inverse_models/utils.py b/atomgpt/inverse_models/utils.py
index b5d441f..6395b24 100644
--- a/atomgpt/inverse_models/utils.py
+++ b/atomgpt/inverse_models/utils.py
@@ -175,7 +175,7 @@ def gen_atoms(
             print(exp)
             pass
 
-    return atoms
+    return atoms, response
 
 
 def get_crystal_string_t(atoms):
@@ -381,7 +381,7 @@ def main_spectra(
             + " Generate atomic structure description with lattice lengths, angles, coordinates and atom types."
         )
         # print(info)
-        atoms = gen_atoms(
+        atoms, _ = gen_atoms(
             prompt=info["input"],
             model=model,
             alpaca_prompt=alpaca_prompt,
From 33e52c87ab49b159a0adaa7b8a6badc306e517b5 Mon Sep 17 00:00:00 2001
From: crhysc
Date: Mon, 8 Dec 2025 20:45:01 -0500
Subject: [PATCH 36/50] initialize abs factory for loading. add chat template
 stubs

---
 atomgpt/inverse_models/factories.py      | 152 +++++++++++++++++++++++
 atomgpt/inverse_models/inverse_models.py |  76 ++----------
 atomgpt/inverse_models/products.py       |  16 +++
 3 files changed, 181 insertions(+), 63 deletions(-)
 create mode 100644 atomgpt/inverse_models/factories.py
 create mode 100644 atomgpt/inverse_models/products.py

diff --git a/atomgpt/inverse_models/factories.py b/atomgpt/inverse_models/factories.py
new file mode 100644
index 0000000..6e0d0ef
--- /dev/null
+++ b/atomgpt/inverse_models/factories.py
@@ -0,0 +1,152 @@
+# factories.py
+
+from abc import ABC, abstractmethod
+from .products import LoadedModel, ChatTemplate
+from .inverse_models import TrainingPropConfig
+from peft import PeftModel
+from .loader import FastLanguageModel as AtomGPTFastLanguageModel
+from unsloth import FastLanguageModel as UnslothFastLanguageModel
+from typing import Dict
+
+
+class LanguageModelFactory(ABC):
+    @abstractmethod
+    def load_for_training(self, config: TrainingPropConfig) -> LoadedModel:
+        pass
+
+    @abstractmethod
+    def load_for_inference(self, checkpoint_path: str, config: TrainingPropConfig) -> LoadedModel:
+        pass
+
+    @abstractmethod
+    def create_chat_template(self, config: TrainingPropConfig) -> ChatTemplate:
+        pass
+
+
+class AlpacaTemplate:
+    def format(self, instruction: str, user_input: str, output: str | None = None) -> str:
+        if output is None:
+            output = ""
+        return f"### Instruction:\n{instruction}\n### Input:\n{user_input}\n### Output:\n{output}"
+
+
+class HarmonyTemplate:
+    def format(self, instruction: str, user_input: str, output: str | None = None) -> str:
+        pass
+
+
+class AtomGPTFactory(LanguageModelFactory):
+    def load_for_training(self, config: TrainingPropConfig) -> LoadedModel:
+        model, tokenizer = AtomGPTFastLanguageModel.from_pretrained(
+            model_name=config.model_name,
+            max_seq_length=config.max_seq_length,
+            dtype=config.dtype,
+            load_in_4bit=config.load_in_4bit
+        )
+        if not isinstance(model, PeftModel):
+            # import sys
+            print("Not yet a peft model, converting into peft model")
+            # sys.exit()
+            model = FastLanguageModel.get_peft_model(
+                model,
+                r=config.lora_rank,  # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
+                target_modules=[
+                    "q_proj",
+                    "k_proj",
+                    "v_proj",
+                    "o_proj",
+                    "gate_proj",
+                    "up_proj",
+                    "down_proj",
+                ],
+                lora_alpha=config.lora_alpha,
+                lora_dropout=0,  # Supports any, but = 0 is optimized
+                bias="none",  # Supports any, but = "none" is optimized
+                use_gradient_checkpointing=True,
+                random_state=3407,
+                use_rslora=False,  # We support rank stabilized LoRA
+                loftq_config=None,  # And LoftQ
+            )
+            print("Peft model created")
+        EOS_TOKEN = tokenizer.eos_token
+        return LoadedModel(model=model, tokenizer=tokenizer)
+
+    def load_for_inference(self, checkpoint_path: str, config: TrainingPropConfig) -> LoadedModel:
+        model, tokenizer = FastLanguageModel.from_pretrained(
+            model_name=checkpoint_path,
+            max_seq_length=config.max_seq_length,
+            dtype=config.dtype,
+            load_in_4bit=config.load_in_4bit,
+        )
+        FastLanguageModel.for_inference(model)
+        return LoadedModel(model=model, tokenizer=tokenizer))
+
+    def create_chat_template(self, config: TrainingPropConfig) -> ChatTemplate:
+        return AlpacaTemplate()
+
+
+class GPTOSSFactory(LanguageModelFactory):
+    def load_for_training(self, config: TrainingPropConfig) -> LoadedModel:
+        model, tokenizer = UnslothFastLanguageModel.from_pretrained(
+            model_name=config.model_name,
+            max_seq_length=config.max_seq_length,
+            dtype=config.dtype,
+            load_in_4bit=config.load_in_4bit,
+            full_finetuning = False,
+        )
+        if not isinstance(model, PeftModel):
+            print("Not yet a peft model, converting into peft model")
+            model = FastLanguageModel.get_peft_model(
+                model,
+                r=config.lora_rank,  # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
+                target_modules=[
+                    "q_proj",
+                    "k_proj",
+                    "v_proj",
+                    "o_proj",
+                    "gate_proj",
+                    "up_proj",
+                    "down_proj",
+                ],
+                lora_alpha=config.lora_alpha,
+                lora_dropout=0,  # Supports any, but = 0 is optimized
+                bias="none",  # Supports any, but = "none" is optimized
+                use_gradient_checkpointing=unsloth,
+                random_state=3407,
+                use_rslora=False,  # We support rank stabilized LoRA
+                loftq_config=None,  # And LoftQ
+            )
+            print("Peft model created")
+        return LoadedModel(model=model, tokenizer=tokenizer)
+
+    def load_for_inference(self, checkpoint_path: str, config: TrainingPropConfig) -> LoadedModel:
+        model, tokenizer = UnslothFastLanguageModel.from_pretrained(
+            model_name=checkpoint_path,
+            max_seq_length=config.max_seq_length,
+            dtype=config.dtype,
+            load_in_4bit=config.load_in_4bit,
+        )
+        FastLanguageModel.for_inference(model)
+        return LoadedModel(model=model, tokenizer=tokenizer)
+
+    def create_chat_template(self, config: TrainingPropConfig) -> ChatTemplate:
+        return HarmonyTemplate()
+
+
+FACTORY_REGISTRY: Dict[str, type[LanguageModelFactory]] = {
+    "gemma": AtomGPTFactory,
+    "qwen": AtomGPTFactory,
+    "Meta": AtomGPTFactory,
+    "Llama": AtomGPTFactory,
+    "llama": AtomGPTFactory,
+    "Mistral": AtomGPTFactory,
+    "mistral": AtomGPTFactory,
+    "gpt-oss": GPTOssFactory,
+}
+
+def get_lm_factory(config: TrainingPropConfig) -> LanguageModelFactory:
+    model_name = config.model_name
+    factory_cls = FACTORY_REGISTRY.get(model_name.split("/", 1)[1].split("-", 1)[0])
+    if factory_cls is None:
+        raise ValueError(f"Unsupported model name: {model_name}. No model factory found.")
+    return factory_cls()
diff --git a/atomgpt/inverse_models/inverse_models.py b/atomgpt/inverse_models/inverse_models.py
index a24e998..08f2f29 100644
--- a/atomgpt/inverse_models/inverse_models.py
+++ b/atomgpt/inverse_models/inverse_models.py
@@ -1,5 +1,9 @@
 from typing import Optional
+from typing import Dict
+from typing import Literal
 from atomgpt.inverse_models.loader import FastLanguageModel
+from .factories import LanguageModelFactory, get_lm_factory
+from .products import LoadedModel, ChatTemplate
 
 # from unsloth import FastLanguageModel
 from atomgpt.inverse_models.callbacks import (
@@ -32,9 +36,9 @@
 import sys
 import json
 import argparse
-from typing import Literal
 import time
 from jarvis.core.composition import Composition
+import traceback
 
 # from atomgpt.inverse_models.custom_trainer import CustomSFTTrainer
 
@@ -301,6 +305,7 @@ def evaluate(
                 gen_err = f"gen_atoms:{type(e).__name__}:{e}"
                 if os.environ.get("PRINT_STRUCTURES"):
                     print(f"Predicted Structure ({sample_id}) FAILED: {gen_err}")
+                    print(traceback.format_exc())
                     print(f"Raw LLM Output ({sample_id}):")
                     print(raw_response)
 
@@ -521,38 +526,10 @@ def main(config_file=None):
         print(alpaca_prop_test_filename, "exists")
         m_test = loadjson(alpaca_prop_test_filename)
 
-    # 4bit pre quantized models we support for 4x faster downloading + no OOMs.
-    model, tokenizer = FastLanguageModel.from_pretrained(
-        model_name=config.model_name,  # Choose ANY! eg teknium/OpenHermes-2.5-Mistral-7B
-        max_seq_length=config.max_seq_length,
-        dtype=config.dtype,
-        load_in_4bit=config.load_in_4bit,
-        # token = "hf_...", # use one if using gated models like meta-llama/Llama-2-7b-hf
-    )
-    if not isinstance(model, PeftModel):
-        # import sys
-        print("Not Peft model")
-        # sys.exit()
-        model = FastLanguageModel.get_peft_model(
-            model,
-            r=config.lora_rank,  # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
-            target_modules=[
-                "q_proj",
-                "k_proj",
-                "v_proj",
-                "o_proj",
-                "gate_proj",
-                "up_proj",
-                "down_proj",
-            ],
-            lora_alpha=config.lora_alpha,
-            lora_dropout=0,  # Supports any, but = 0 is optimized
-            bias="none",  # Supports any, but = "none" is optimized
-            use_gradient_checkpointing=True,
-            random_state=3407,
-            use_rslora=False,  # We support rank stabilized LoRA
-            loftq_config=None,  # And LoftQ
-        )
+    factory = get_lm_factory(config)
+    loaded: LoadedModel = factory.load_for_training(config)
+    model, tokenizer = loaded.model, loaded.tokenizer
+    chat_template = factory.create_chat_template(config)
 
     EOS_TOKEN = tokenizer.eos_token  # Must add EOS_TOKEN
     # tokenizer.pad_token_id = tokenizer.eos_token_id
@@ -647,8 +624,6 @@ def tokenize_function(example):
     trainer = SFTTrainer(
         model=model,
         train_dataset=tokenized_train,
-        # train_dataset = train_dataset,
-        # tokenizer = tokenizer,
         args=SFTConfig(
             dataset_text_field="text",
             max_seq_length=config.max_seq_length,
@@ -672,44 +647,19 @@ def tokenize_function(example):
     if callback_samples > 0:
         callback = ExampleTrainerCallback(
             some_tokenized_dataset=tokenized_eval,
-            # some_tokenized_dataset=tokenized_eval,
             tokenizer=tokenizer,
             max_length=config.max_seq_length,
             callback_samples=callback_samples,
         )
         trainer.add_callback(callback)
+
     gpu_usage = PrintGPUUsageCallback()
     trainer.add_callback(gpu_usage)
-    trainer_stats = trainer.train()
+    trainer_stats = trainer.train(resume_from_checkpoint=True)
     trainer.save_model(config.model_save_path)
-    # model.save_pretrained(config.model_save_path)
-    # model, tokenizer = FastLanguageModel.from_pretrained(
-    #    model_name=config.model_save_path,  # YOUR MODEL YOU USED FOR TRAINING
-    #    max_seq_length=config.max_seq_length,
-    #    dtype=config.dtype,
-    #    load_in_4bit=config.load_in_4bit,
-    # )
 
     model = trainer.model
-    FastLanguageModel.for_inference(model)  # Enable native 2x faster inference
-    # model, tokenizer, config = load_model(path=config.model_save_path)
-    # batch_evaluate(
-    #    prompts=[i["input"] for i in m_test],
-    #    model=model,
-    #    tokenizer=tokenizer,
-    #    csv_out=config.csv_out,
-    #    config=config,
-    # )
-    # t1 = time.time()
-    # batch_evaluate(
-    #    test_set=m_test,
-    #    model=model,
-    #    tokenizer=tokenizer,
-    #    csv_out=config.csv_out,
-    #    config=config,
-    # )
-    # t2 = time.time()
-    # t1a = time.time()
+    FastLanguageModel.for_inference(model)
     evaluate(
         test_set=m_test,
         model=model,
diff --git a/atomgpt/inverse_models/products.py b/atomgpt/inverse_models/products.py
new file mode 100644
index 0000000..2f12923
--- /dev/null
+++ b/atomgpt/inverse_models/products.py
@@ -0,0 +1,16 @@
+# products.py
+
+from dataclasses import dataclass
+from typing import Protocol, Any
+import torch
+from transformers import PreTrainedTokenizerBase
+
+@dataclass
+class LoadedModel:
+    model: torch.nn.Module
+    tokenizer: PreTrainedTokenizerBase
+
+
+class ChatTemplate(Protocol):
+    def format() -> str:
+        pass
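As of patch 36, callers are meant to go through the factory seam rather than touching the loaders directly. A hedged usage sketch against the API exactly as introduced here (later patches rename create_chat_template); config is any object carrying the TrainingPropConfig fields, and the string arguments are placeholders:

from atomgpt.inverse_models.factories import get_lm_factory

def build_for_training(config):
    factory = get_lm_factory(config)            # dispatch on config.model_name
    loaded = factory.load_for_training(config)  # LoadedModel(model, tokenizer)
    template = factory.create_chat_template(config)
    # AlpacaTemplate.format(instruction, user_input, output=None)
    text = template.format("Generate a structure.", "The formula is SiO2 .")
    return loaded.model, loaded.tokenizer, text

Note the registry keys off model_name.split("/", 1)[1].split("-", 1)[0], so a model name with no "/" raises an IndexError before the ValueError can fire — one motivation for the simpler substring dispatch adopted in patch 38 below.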
From d0dd79354b1bfee51cc82f32f3542f93c8d00e67 Mon Sep 17 00:00:00 2001
From: crhysc
Date: Mon, 8 Dec 2025 20:46:24 -0500
Subject: [PATCH 37/50] add kwargs to format()

---
 atomgpt/inverse_models/products.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/atomgpt/inverse_models/products.py b/atomgpt/inverse_models/products.py
index 2f12923..08aae56 100644
--- a/atomgpt/inverse_models/products.py
+++ b/atomgpt/inverse_models/products.py
@@ -12,5 +12,5 @@ class LoadedModel:
 
 
 class ChatTemplate(Protocol):
-    def format() -> str:
+    def format(self, instruction: str, user_input: str, output: str | None = None) -> str:
         pass

From 1970a36fa401cf3d0abbfa3136edb2e49befb38e Mon Sep 17 00:00:00 2001
From: crhysc
Date: Sun, 14 Dec 2025 04:25:09 -0500
Subject: [PATCH 38/50] get harmony template in factory

---
 atomgpt/inverse_models/dataset_utils.py  | 65 +++++++++++++++++++-
 atomgpt/inverse_models/factories.py      | 51 +++++++---------
 atomgpt/inverse_models/inverse_models.py | 75 ++----------------
 atomgpt/inverse_models/products.py       |  4 +-
 4 files changed, 91 insertions(+), 104 deletions(-)

diff --git a/atomgpt/inverse_models/dataset_utils.py b/atomgpt/inverse_models/dataset_utils.py
index 8bf6b81..9e047ec 100644
--- a/atomgpt/inverse_models/dataset_utils.py
+++ b/atomgpt/inverse_models/dataset_utils.py
@@ -753,6 +753,67 @@ def _tokenize(example):
             )
             pass
         return dataset
-
-    pass
+
+def make_alpaca_json(
+    dataset=[],
+    jids=[],
+    # prop="Tc_supercon",
+    # instruction="",
+    include_jid=False,
+    # chem_info="",
+    # output_prompt="",
+    config=None,
+):
+    mem = []
+    print("config.prop", config.prop)
+    for i in dataset:
+        if i[config.prop] != "na" and i[config.id_tag] in jids:
+            atoms = Atoms.from_dict(i["atoms"])
+            info = {}
+            if include_jid:
+                info["id"] = i[config.id_tag]
+            info["instruction"] = config.instruction
+            if config.chem_info == "none":
+                chem = ""
+            elif config.chem_info == "element_list":
+                chem = atoms.composition.search_string
+            elif config.chem_info == "element_dict":
+                comp = Composition.from_string(
+                    atoms.composition.reduced_formula
+                )
+                chem = comp.to_dict()
+                chem = str(dict(sorted(chem.items())))
+            elif config.chem_info == "formula":
+                chem = atoms.composition.reduced_formula
+
+            inp = get_input(config=config, val=i[config.prop], chem=chem)
+            info["input"] = inp
+
+            info["output"] = get_crystal_string_t(atoms)
+            mem.append(info)
+    return mem
+
+def alpaca_formatting_prompts_func(examples: Dict[str, Any], alpaca_prompt: str, eos_token: str) -> Dict[str, List[str]]:
+    inst = examples["instruction"]
+    inp = examples["input"]
+    out = examples["output"]
+    texts = [alpaca_prompt.format(i, x, y) + eos_token for i, x, y in zip(inst, inp, out)]
+    return {"text": texts}
+
+def harmony_formatting_prompts_func(examples: Dict[str, Any], tokenizer) -> Dict[str, List[str]]:
+    inst = examples["instruction"]
+    inp = examples["input"]
+    out = examples["output"]
+    texts: List[str] = []
+    for i, x, y in zip(inst, inp, out):
+        messages = []
+        i = (i or "").strip()
+        x = (x or "").strip()
+        y = (y or "").strip()
+        if i:
+            messages.append({"role": "developer", "content": i})
+        messages.append({"role": "user", "content": x})
+        messages.append({"role": "assistant", "content": y})
+        texts.append(tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=False))
+    return {"text": texts}
diff --git a/atomgpt/inverse_models/factories.py b/atomgpt/inverse_models/factories.py
index 6e0d0ef..c9b5c2d 100644
--- a/atomgpt/inverse_models/factories.py
+++ b/atomgpt/inverse_models/factories.py
@@ -2,11 +2,16 @@
 
 from abc import ABC, abstractmethod
 from .products import LoadedModel, ChatTemplate
+from typing import Callable
 from .inverse_models import TrainingPropConfig
 from peft import PeftModel
 from .loader import FastLanguageModel as AtomGPTFastLanguageModel
 from unsloth import FastLanguageModel as UnslothFastLanguageModel
 from typing import Dict
+from .dataset_utils import alpaca_formatting_prompts_func
+from .dataset_utils import harmony_formatting_prompts_func
+from functools import partial
+from typing import List
 
 
 class LanguageModelFactory(ABC):
     @abstractmethod
@@ -19,19 +24,7 @@ def load_for_inference(self, checkpoint_path: str, config: TrainingPropConfig) -> LoadedModel:
         pass
 
     @abstractmethod
-    def create_chat_template(self, config: TrainingPropConfig) -> ChatTemplate:
-        pass
-
-
-class AlpacaTemplate:
-    def format(self, instruction: str, user_input: str, output: str | None = None) -> str:
-        if output is None:
-            output = ""
-        return f"### Instruction:\n{instruction}\n### Input:\n{user_input}\n### Output:\n{output}"
-
-
-class HarmonyTemplate:
-    def format(self, instruction: str, user_input: str, output: str | None = None) -> str:
+    def get_formatting_prompts_func(self, config, model, tokenizer) -> Callable:
         pass
 
@@ -40,7 +33,7 @@ class AtomGPTFactory(LanguageModelFactory):
     def load_for_training(self, config: TrainingPropConfig) -> LoadedModel:
         if not isinstance(model, PeftModel):
             # import sys
             print("Not yet a peft model, converting into peft model")
             # sys.exit()
-            model = FastLanguageModel.get_peft_model(
+            model = AtomGPTFastLanguageModel.get_peft_model(
                 model,
                 r=config.lora_rank,  # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
                 target_modules=[
@@ -78,11 +71,12 @@ def load_for_inference(self, checkpoint_path: str, config: TrainingPropConfig) -> LoadedModel:
             dtype=config.dtype,
             load_in_4bit=config.load_in_4bit,
         )
-        FastLanguageModel.for_inference(model)
-        return LoadedModel(model=model, tokenizer=tokenizer))
+        AtomGPTFastLanguageModel.for_inference(model)
+        return LoadedModel(model=model, tokenizer=tokenizer)
 
-    def create_chat_template(self, config: TrainingPropConfig) -> ChatTemplate:
-        return AlpacaTemplate()
+    def get_formatting_prompts_func(self, config, model, tokenizer) -> Callable:
+        eos = tokenizer.eos_token or ""
+        return partial(alpaca_formatting_prompts_func, alpaca_prompt=config.alpaca_prompt, eos_token=eos)
 
 
 class GPTOSSFactory(LanguageModelFactory):
@@ -96,7 +90,7 @@ def load_for_training(self, config: TrainingPropConfig) -> LoadedModel:
         )
         if not isinstance(model, PeftModel):
             print("Not yet a peft model, converting into peft model")
-            model = FastLanguageModel.get_peft_model(
+            model = UnslothFastLanguageModel.get_peft_model(
                 model,
                 r=config.lora_rank,  # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
                 target_modules=[
@@ -111,7 +105,7 @@ def load_for_training(self, config: TrainingPropConfig) -> LoadedModel:
                 lora_alpha=config.lora_alpha,
                 lora_dropout=0,  # Supports any, but = 0 is optimized
                 bias="none",  # Supports any, but = "none" is optimized
-                use_gradient_checkpointing=unsloth,
+                use_gradient_checkpointing=True,
                 random_state=3407,
                 use_rslora=False,  # We support rank stabilized LoRA
                 loftq_config=None,  # And LoftQ
@@ -126,12 +120,11 @@ def load_for_inference(self, checkpoint_path: str, config: TrainingPropConfig) -> LoadedModel:
             dtype=config.dtype,
             load_in_4bit=config.load_in_4bit,
         )
-        FastLanguageModel.for_inference(model)
+        UnslothFastLanguageModel.for_inference(model)
         return LoadedModel(model=model, tokenizer=tokenizer)
 
-    def create_chat_template(self, config: TrainingPropConfig) -> ChatTemplate:
-        return HarmonyTemplate()
-
+    def get_formatting_prompts_func(self, config, model, tokenizer) -> Callable:
+        return partial(harmony_formatting_prompts_func, tokenizer=tokenizer)
 
 FACTORY_REGISTRY: Dict[str, type[LanguageModelFactory]] = {
     "gemma": AtomGPTFactory,
@@ -141,12 +134,12 @@ def get_formatting_prompts_func(self, config, model, tokenizer) -> Callable:
     "llama": AtomGPTFactory,
     "Mistral": AtomGPTFactory,
     "mistral": AtomGPTFactory,
-    "gpt-oss": GPTOssFactory,
+    "gpt-oss": GPTOSSFactory,
 }
 
 def get_lm_factory(config: TrainingPropConfig) -> LanguageModelFactory:
     model_name = config.model_name
-    factory_cls = FACTORY_REGISTRY.get(model_name.split("/", 1)[1].split("-", 1)[0])
-    if factory_cls is None:
-        raise ValueError(f"Unsupported model name: {model_name}. No model factory found.")
-    return factory_cls()
+    if "gpt-oss" in model_name:
+        return GPTOSSFactory()
+    else:
+        return AtomGPTFactory()
diff --git a/atomgpt/inverse_models/inverse_models.py b/atomgpt/inverse_models/inverse_models.py
index 08f2f29..169cb0c 100644
--- a/atomgpt/inverse_models/inverse_models.py
+++ b/atomgpt/inverse_models/inverse_models.py
@@ -145,62 +145,6 @@ def get_input(config=None, chem="", val=10):
     )
     return inp
 
-
-def make_alpaca_json(
-    dataset=[],
-    jids=[],
-    # prop="Tc_supercon",
-    # instruction="",
-    include_jid=False,
-    # chem_info="",
-    # output_prompt="",
-    config=None,
-):
-    mem = []
-    print("config.prop", config.prop)
-    for i in dataset:
-        if i[config.prop] != "na" and i[config.id_tag] in jids:
-            atoms = Atoms.from_dict(i["atoms"])
-            info = {}
-            if include_jid:
-                info["id"] = i[config.id_tag]
-            info["instruction"] = config.instruction
-            if config.chem_info == "none":
-                chem = ""
-            elif config.chem_info == "element_list":
-                chem = atoms.composition.search_string
-            elif config.chem_info == "element_dict":
-                comp = Composition.from_string(
-                    atoms.composition.reduced_formula
-                )
-                chem = comp.to_dict()
-                chem = str(dict(sorted(chem.items())))
-            elif config.chem_info == "formula":
-                chem = atoms.composition.reduced_formula
-
-            inp = get_input(config=config, val=i[config.prop], chem=chem)
-            info["input"] = inp
-
-            info["output"] = get_crystal_string_t(atoms)
-            mem.append(info)
-    return mem
-
-
-def formatting_prompts_func(examples, alpaca_prompt):
-    instructions = examples["instruction"]
-    inputs = examples["input"]
-    outputs = examples["output"]
-    texts = []
-    EOS_TOKEN = ""
-    for instruction, input, output in zip(instructions, inputs, outputs):
-        # Must add EOS_TOKEN, otherwise your generation will go on forever!
-        text = alpaca_prompt.format(instruction, input, output) + EOS_TOKEN
-        texts.append(text)
-    return {
-        "text": texts,
-    }
-
-
 def load_model(path="", config=None):
     if config is None:
         config_file = os.path.join(path, "config.json")
@@ -410,7 +354,6 @@ def batch_evaluate(
 
 def main(config_file=None):
     if config_file is None:
-
         args = parser.parse_args(sys.argv[1:])
         config_file = args.config_name
     if not torch.cuda.is_available():
@@ -437,17 +380,13 @@ def main(config_file=None):
     run_path = os.path.dirname(id_prop_path)
     num_train = config.num_train
     num_test = config.num_test
-    # model_name = config.model_name
     callback_samples = config.callback_samples
-    # loss_function = config.loss_function
-    # id_prop_path = os.path.join(run_path, id_prop_path)
     with open(id_prop_path, "r") as f:
         reader = csv.reader(f)
         dt = [row for row in reader]
     if not num_train:
         num_test = int(len(dt) * config.test_ratio)
         num_train = len(dt) - num_test
-
     dat = []
     ids = []
     for i in tqdm(dt, total=len(dt)):
@@ -486,7 +425,7 @@ def main(config_file=None):
     print("num_train", num_train)
     print("num_test", num_test)
     test_ids = ids[num_train : num_train + num_test]
-    # test_ids = ids[num_train:]
+
     alpaca_prop_train_filename = os.path.join(
         config.output_dir, "alpaca_prop_train.json"
     )
@@ -529,11 +468,8 @@ def main(config_file=None):
     factory = get_lm_factory(config)
     loaded: LoadedModel = factory.load_for_training(config)
     model, tokenizer = loaded.model, loaded.tokenizer
-    chat_template = factory.create_chat_template(config)
+    formatting_prompts_func = factory.get_formatting_prompts_func(config, model, tokenizer)
 
-    EOS_TOKEN = tokenizer.eos_token  # Must add EOS_TOKEN
-    # tokenizer.pad_token_id = tokenizer.eos_token_id
-    # model.resize_token_embeddings(len(tokenizer))
     train_dataset = load_dataset(
         "json",
         data_files=alpaca_prop_train_filename,
@@ -546,9 +482,6 @@ def main(config_file=None):
         split="train",
         # "json", data_files="alpaca_prop_train.json", split="train"
     )
-    formatting_prompts_func_with_prompt = partial(
-        formatting_prompts_func, alpaca_prompt=config.alpaca_prompt
-    )
 
     def tokenize_function(example):
         return tokenizer(
@@ -559,12 +492,12 @@ def tokenize_function(example):
         )
 
     train_dataset = train_dataset.map(
-        formatting_prompts_func_with_prompt,
+        formatting_prompts_func,
         batched=True,
         num_proc=config.dataset_num_proc
     )
     eval_dataset = eval_dataset.map(
-        formatting_prompts_func_with_prompt,
+        formatting_prompts_func,
         batched=True,
         num_proc=config.dataset_num_proc
     )
diff --git a/atomgpt/inverse_models/products.py b/atomgpt/inverse_models/products.py
index 08aae56..3de3171 100644
--- a/atomgpt/inverse_models/products.py
+++ b/atomgpt/inverse_models/products.py
@@ -11,6 +11,6 @@ class LoadedModel:
     tokenizer: PreTrainedTokenizerBase
 
 
-class ChatTemplate(Protocol):
-    def format(self, instruction: str, user_input: str, output: str | None = None) -> str:
+class DatasetFormattingFunction(Protocol):
+    def get_formatting_prompts_func() -> function:
         pass

From 11b2ab6eefee862285d46d7315c5bd4f0aa5a02c Mon Sep 17 00:00:00 2001
From: crhysc
Date: Sun, 14 Dec 2025 04:29:27 -0500
Subject: [PATCH 39/50] from typing import Any

---
 atomgpt/inverse_models/dataset_utils.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/atomgpt/inverse_models/dataset_utils.py b/atomgpt/inverse_models/dataset_utils.py
index 9e047ec..535df28 100644
--- a/atomgpt/inverse_models/dataset_utils.py
+++ b/atomgpt/inverse_models/dataset_utils.py
@@ -6,6 +6,7 @@
 
 from typing import Union, Callable, Optional, List, Dict
 import torch
+from typing import Any
 
 # From https://www.geeksforgeeks.org/longest-common-substring-array-strings/
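For GPT-OSS, harmony_formatting_prompts_func delegates the actual token layout to the tokenizer's chat template rather than hand-rolling a prompt string. A sketch of the per-example message list it builds (field contents are illustrative; the commented call requires a loaded GPT-OSS tokenizer):

messages = [
    {"role": "developer", "content": "Below is a description of a superconductor."},
    {"role": "user", "content": "The chemical formula is MgB2 . The Tc_supercon is 39."},
    {"role": "assistant", "content": "3.08 3.08 3.52\n90 90 120\nMg 0 0 0 ..."},
]
# text = tokenizer.apply_chat_template(
#     messages, tokenize=False, add_generation_prompt=False
# )

Because the chat template injects the model's own special tokens, this path needs no manual EOS_TOKEN concatenation, unlike the Alpaca formatter.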
From 52228edf97fde0003f4b9c74ac179a9e8fea54a3 Mon Sep 17 00:00:00 2001
From: ccamp104
Date: Sun, 14 Dec 2025 04:33:08 -0500
Subject: [PATCH 40/50] remove relative import

---
 atomgpt/inverse_models/factories.py      | 10 +++++-----
 atomgpt/inverse_models/inverse_models.py |  4 ++--
 2 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/atomgpt/inverse_models/factories.py b/atomgpt/inverse_models/factories.py
index c9b5c2d..892aeca 100644
--- a/atomgpt/inverse_models/factories.py
+++ b/atomgpt/inverse_models/factories.py
@@ -1,15 +1,15 @@
 # factories.py
 
 from abc import ABC, abstractmethod
-from .products import LoadedModel, ChatTemplate
+from atomgpt.inverse_models.products import LoadedModel, ChatTemplate
 from typing import Callable
-from .inverse_models import TrainingPropConfig
+from atomgpt.inverse_models.inverse_models import TrainingPropConfig
 from peft import PeftModel
-from .loader import FastLanguageModel as AtomGPTFastLanguageModel
+from atomgpt.inverse_models.loader import FastLanguageModel as AtomGPTFastLanguageModel
 from unsloth import FastLanguageModel as UnslothFastLanguageModel
 from typing import Dict
-from .dataset_utils import alpaca_formatting_prompts_func
-from .dataset_utils import harmony_formatting_prompts_func
+from atomgpt.inverse_models.dataset_utils import alpaca_formatting_prompts_func
+from atomgpt.inverse_models.dataset_utils import harmony_formatting_prompts_func
 from functools import partial
 from typing import List
 
diff --git a/atomgpt/inverse_models/inverse_models.py b/atomgpt/inverse_models/inverse_models.py
index 169cb0c..9c3b8ca 100644
--- a/atomgpt/inverse_models/inverse_models.py
+++ b/atomgpt/inverse_models/inverse_models.py
@@ -2,8 +2,8 @@
 from typing import Dict
 from typing import Literal
 from atomgpt.inverse_models.loader import FastLanguageModel
-from .factories import LanguageModelFactory, get_lm_factory
-from .products import LoadedModel, ChatTemplate
+from atomgpt.inverse_models.factories import LanguageModelFactory, get_lm_factory
+from atomgpt.inverse_models.products import LoadedModel, ChatTemplate
 
 # from unsloth import FastLanguageModel
 from atomgpt.inverse_models.callbacks import (

From 1464ce13aec7b183feb249d2a506c875623a1551 Mon Sep 17 00:00:00 2001
From: ccamp104
Date: Sun, 14 Dec 2025 04:35:18 -0500
Subject: [PATCH 41/50] import callable

---
 atomgpt/inverse_models/products.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/atomgpt/inverse_models/products.py b/atomgpt/inverse_models/products.py
index 3de3171..6004f81 100644
--- a/atomgpt/inverse_models/products.py
+++ b/atomgpt/inverse_models/products.py
@@ -1,7 +1,7 @@
 # products.py
 
 from dataclasses import dataclass
-from typing import Protocol, Any
+from typing import Protocol, Any, Callable
 import torch
 from transformers import PreTrainedTokenizerBase
 
@@ -12,5 +12,5 @@ class LoadedModel:
 
 
 class DatasetFormattingFunction(Protocol):
-    def get_formatting_prompts_func() -> function:
+    def get_formatting_prompts_func() -> Callable:
         pass

From 9793ad838b5155e0b206fbaf6684ba6a28a4faa6 Mon Sep 17 00:00:00 2001
From: ccamp104
Date: Sun, 14 Dec 2025 04:36:25 -0500
Subject: [PATCH 42/50] remove import chattemplate

---
 atomgpt/inverse_models/factories.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/atomgpt/inverse_models/factories.py b/atomgpt/inverse_models/factories.py
index 892aeca..988bdfd 100644
--- a/atomgpt/inverse_models/factories.py
+++ b/atomgpt/inverse_models/factories.py
@@ -1,7 +1,7 @@
 # factories.py
 
 from abc import ABC, abstractmethod
-from atomgpt.inverse_models.products import LoadedModel, ChatTemplate
+from atomgpt.inverse_models.products import LoadedModel
 from typing import Callable
 from atomgpt.inverse_models.inverse_models import TrainingPropConfig
 from peft import PeftModel

From 3cfc59a71daa149a7c74c325a261e14d718b2720 Mon Sep 17 00:00:00 2001
From: ccamp104
Date: Sun, 14 Dec 2025 04:40:13 -0500
Subject: [PATCH 43/50] remove imports to non-interface objects for model
 loading and chat templates

---
 atomgpt/inverse_models/inverse_models.py | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/atomgpt/inverse_models/inverse_models.py b/atomgpt/inverse_models/inverse_models.py
index 9c3b8ca..8cbdc1e 100644
--- a/atomgpt/inverse_models/inverse_models.py
+++ b/atomgpt/inverse_models/inverse_models.py
@@ -1,11 +1,8 @@
 from typing import Optional
 from typing import Dict
 from typing import Literal
-from atomgpt.inverse_models.loader import FastLanguageModel
-from atomgpt.inverse_models.factories import LanguageModelFactory, get_lm_factory
-from atomgpt.inverse_models.products import LoadedModel, ChatTemplate
+from atomgpt.inverse_models.factories import get_lm_factory
 
-# from unsloth import FastLanguageModel
 from atomgpt.inverse_models.callbacks import (
     PrintGPUUsageCallback,
     ExampleTrainerCallback,

From 6e8771ae61beb0e5039977a7e826de0d015562f6 Mon Sep 17 00:00:00 2001
From: ccamp104
Date: Sun, 14 Dec 2025 04:46:50 -0500
Subject: [PATCH 44/50] add type checking if statement for the
 trainingpropconfig import

---
 atomgpt/inverse_models/factories.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/atomgpt/inverse_models/factories.py b/atomgpt/inverse_models/factories.py
index 988bdfd..a48a2c6 100644
--- a/atomgpt/inverse_models/factories.py
+++ b/atomgpt/inverse_models/factories.py
@@ -3,7 +3,9 @@
 from abc import ABC, abstractmethod
 from atomgpt.inverse_models.products import LoadedModel
 from typing import Callable
-from atomgpt.inverse_models.inverse_models import TrainingPropConfig
+from typing import TYPE_CHECKING
+if TYPE_CHECKING:
+    from atomgpt.inverse_models.inverse_models import TrainingPropConfig
 from peft import PeftModel
 from atomgpt.inverse_models.loader import FastLanguageModel as AtomGPTFastLanguageModel
 from unsloth import FastLanguageModel as UnslothFastLanguageModel

From 9a41780c2edeb7a5af461da480ea96a1d9fa74a9 Mon Sep 17 00:00:00 2001
From: ccamp104
Date: Sun, 14 Dec 2025 04:49:30 -0500
Subject: [PATCH 45/50] add unsloth>=2024.10,<2025.3

---
 requirements.txt | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/requirements.txt b/requirements.txt
index 61e4f7f..af7c05c 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -26,9 +26,9 @@ frozenlist==1.6.0
 fsspec==2025.3.0
 gguf==0.16.3
 greenlet==3.2.4
-hf-xet>=1.1.2
+hf-xet==1.1.2
 hf_transfer==0.1.9
-huggingface-hub>=0.32.0
+huggingface-hub==0.32.0
 idna==3.10
 imageio==2.37.0
 importlib_metadata==8.7.0
@@ -112,6 +112,7 @@ typing-inspection==0.4.1
 typing_extensions==4.13.2
 tyro==0.9.21
 tzdata==2025.2
+unsloth>=2024.10,<2025.3
 urllib3==2.4.0
 uv==0.7.8
 xformers==0.0.30
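The TYPE_CHECKING guard added in patch 44 (and the lazy in-function imports of patch 46 below) are the standard way to keep an annotation-only or heavyweight dependency from executing at import time, which is what breaks the factories.py / inverse_models.py cycle. In isolation (heavy_module and SomeConfig are hypothetical names):

from __future__ import annotations      # annotations stay strings at runtime
from typing import TYPE_CHECKING

if TYPE_CHECKING:                       # True for type checkers, False when running
    from heavy_module import SomeConfig # never imported by the interpreter

def consumer(config: SomeConfig) -> None:
    # The annotation is only metadata here; no heavy_module import is triggered.
    print(config)

consumer("any runtime value works")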
100644 --- a/atomgpt/inverse_models/factories.py +++ b/atomgpt/inverse_models/factories.py @@ -1,5 +1,7 @@ # factories.py +from __future__ import annotations + from abc import ABC, abstractmethod from atomgpt.inverse_models.products import LoadedModel from typing import Callable @@ -7,8 +9,6 @@ if TYPE_CHECKING: from atomgpt.inverse_models.inverse_models import TrainingPropConfig from peft import PeftModel -from atomgpt.inverse_models.loader import FastLanguageModel as AtomGPTFastLanguageModel -from unsloth import FastLanguageModel as UnslothFastLanguageModel from typing import Dict from atomgpt.inverse_models.dataset_utils import alpaca_formatting_prompts_func from atomgpt.inverse_models.dataset_utils import harmony_formatting_prompts_func @@ -32,6 +32,7 @@ def get_formatting_prompts_func(self, config, model, tokenizer) -> Callable: class AtomGPTFactory(LanguageModelFactory): def load_for_training(self, config: TrainingPropConfig) -> LoadedModel: + from atomgpt.inverse_models.loader import FastLanguageModel as AtomGPTFastLanguageModel model, tokenizer = AtomGPTFastLanguageModel.from_pretrained( model_name=config.model_name, max_seq_length=config.max_seq_length, @@ -83,6 +84,7 @@ def get_formatting_prompts_func(self, config, model, tokenizer) -> Callable: class GPTOSSFactory(LanguageModelFactory): def load_for_training(self, config: TrainingPropConfig) -> LoadedModel: + from unsloth import FastLanguageModel as UnslothFastLanguageModel model, tokenizer = UnslothFastLanguageModel.from_pretrained( model_name=config.model_name, max_seq_length=config.max_seq_length, From c8790f8f35bc55607f0585f3d9583412899fa3bd Mon Sep 17 00:00:00 2001 From: ccamp104 Date: Sun, 14 Dec 2025 05:15:43 -0500 Subject: [PATCH 47/50] from atomgpt.inverse_models.dataset_utils import make_alpaca_json --- atomgpt/inverse_models/inverse_models.py | 1 + 1 file changed, 1 insertion(+) diff --git a/atomgpt/inverse_models/inverse_models.py b/atomgpt/inverse_models/inverse_models.py index 8cbdc1e..c0b634d 100644 --- a/atomgpt/inverse_models/inverse_models.py +++ b/atomgpt/inverse_models/inverse_models.py @@ -36,6 +36,7 @@ import time from jarvis.core.composition import Composition import traceback +from atomgpt.inverse_models.dataset_utils import make_alpaca_json # from atomgpt.inverse_models.custom_trainer import CustomSFTTrainer From 780d334595fcc6b62d2707e691ae0acc082ba4bb Mon Sep 17 00:00:00 2001 From: ccamp104 Date: Sun, 14 Dec 2025 05:18:25 -0500 Subject: [PATCH 48/50] add imports for make_alpaca_json() --- atomgpt/inverse_models/dataset_utils.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/atomgpt/inverse_models/dataset_utils.py b/atomgpt/inverse_models/dataset_utils.py index 535df28..ef676b1 100644 --- a/atomgpt/inverse_models/dataset_utils.py +++ b/atomgpt/inverse_models/dataset_utils.py @@ -7,6 +7,15 @@ from typing import Union, Callable, Optional, List, Dict import torch from typing import Any +from jarvis.core.atoms import Atoms +from jarvis.io.vasp.inputs import Poscar +from jarvis.core.composition import Composition +from atomgpt.inverse_models.utils import ( + gen_atoms, + text2atoms, + get_crystal_string_t, + get_figlet, +) # From https://www.geeksforgeeks.org/longest-common-substring-array-strings/ From 58bac34bdd53ee80e37b473882e4b91fd5a68e18 Mon Sep 17 00:00:00 2001 From: ccamp104 Date: Sun, 14 Dec 2025 05:20:02 -0500 Subject: [PATCH 49/50] mv get_input() to dataset_utils --- atomgpt/inverse_models/dataset_utils.py | 33 ++++++++++++++++++++++++ atomgpt/inverse_models/inverse_models.py | 33 
------------------------ 2 files changed, 33 insertions(+), 33 deletions(-) diff --git a/atomgpt/inverse_models/dataset_utils.py b/atomgpt/inverse_models/dataset_utils.py index ef676b1..889260a 100644 --- a/atomgpt/inverse_models/dataset_utils.py +++ b/atomgpt/inverse_models/dataset_utils.py @@ -765,6 +765,39 @@ def _tokenize(example): return dataset pass +def get_input(config=None, chem="", val=10): + if config.chem_info == "none": + prefix = "" + elif config.chem_info == "element_list": + prefix = ( + "The chemical elements are " + + chem # atoms.composition.search_string + + " . " + ) + elif config.chem_info == "element_dict": + prefix = ( + "The chemical contents are " + + chem # atoms.composition.search_string + + " . " + ) + elif config.chem_info == "formula": + prefix = ( + "The chemical formula is " + + chem # atoms.composition.reduced_formula + + " . " + ) + + inp = ( + prefix + + "The " + + config.prop + + " is " + + str(val) + + "." + + config.output_prompt + ) + return inp + def make_alpaca_json( dataset=[], jids=[], diff --git a/atomgpt/inverse_models/inverse_models.py b/atomgpt/inverse_models/inverse_models.py index c0b634d..d7ee496 100644 --- a/atomgpt/inverse_models/inverse_models.py +++ b/atomgpt/inverse_models/inverse_models.py @@ -110,39 +110,6 @@ class TrainingPropConfig(BaseSettings): logging_steps: int = 10 -def get_input(config=None, chem="", val=10): - if config.chem_info == "none": - prefix = "" - elif config.chem_info == "element_list": - prefix = ( - "The chemical elements are " - + chem # atoms.composition.search_string - + " . " - ) - elif config.chem_info == "element_dict": - prefix = ( - "The chemical contents are " - + chem # atoms.composition.search_string - + " . " - ) - elif config.chem_info == "formula": - prefix = ( - "The chemical formula is " - + chem # atoms.composition.reduced_formula - + " . " - ) - - inp = ( - prefix - + "The " - + config.prop - + " is " - + str(val) - + "." - + config.output_prompt - ) - return inp - def load_model(path="", config=None): if config is None: config_file = os.path.join(path, "config.json") From 73e3096a9823b6cc90dbf103eb168d2f6e272ad9 Mon Sep 17 00:00:00 2001 From: ccamp104 Date: Sun, 14 Dec 2025 05:22:38 -0500 Subject: [PATCH 50/50] rm resume from chkpt=true --- atomgpt/inverse_models/inverse_models.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/atomgpt/inverse_models/inverse_models.py b/atomgpt/inverse_models/inverse_models.py index d7ee496..e343437 100644 --- a/atomgpt/inverse_models/inverse_models.py +++ b/atomgpt/inverse_models/inverse_models.py @@ -553,7 +553,7 @@ def tokenize_function(example): gpu_usage = PrintGPUUsageCallback() trainer.add_callback(gpu_usage) - trainer_stats = trainer.train(resume_from_checkpoint=True) + trainer_stats = trainer.train() trainer.save_model(config.model_save_path) model = trainer.model
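---

Notes on recurring patterns in this series (illustrative sketches, not project code):

Patch 41 fixes an annotation that fails at import time: "function" is not a defined name in Python 3, so def get_formatting_prompts_func() -> function: raises NameError the moment the class body executes, while Callable from typing is the supported spelling. A minimal sketch of the corrected Protocol with hypothetical class names; note that giving the protocol method a self parameter (which the patched code still omits) is what lets it match ordinary instance methods structurally:

from typing import Callable, Protocol


class FormattingProvider(Protocol):
    # Illustrative stand-in for DatasetFormattingFunction; self makes the
    # protocol match instance methods of implementing classes.
    def get_formatting_prompts_func(self) -> Callable:
        ...


class AlpacaProvider:
    def get_formatting_prompts_func(self) -> Callable:
        # Returns a closure, which is exactly what Callable advertises.
        def fmt(example: dict) -> str:
            return str(example)

        return fmt


provider: FormattingProvider = AlpacaProvider()  # passes structural checks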
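Patches 44 and 46 together apply the standard recipe for breaking a circular import: factories.py needs TrainingPropConfig from inverse_models.py only for type annotations, so the import moves under typing.TYPE_CHECKING (evaluated by type checkers, skipped at runtime) while from __future__ import annotations keeps annotations as lazily evaluated strings. A self-contained sketch of the pattern, with hypothetical module names:

# factories_sketch.py -- hypothetical module illustrating patches 44 and 46
from __future__ import annotations  # annotations stay unevaluated strings

from typing import TYPE_CHECKING

if TYPE_CHECKING:
    # Seen by mypy/pyright only; at runtime this block never runs, so the
    # import cycle back into the config module cannot trigger.
    from config_sketch import TrainingConfig


def load_for_training(config: TrainingConfig) -> None:
    # "TrainingConfig" is never looked up at runtime under the future import.
    print(config)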
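The other half of patch 46 moves the two FastLanguageModel imports from module scope into the factory methods that use them, so importing factories.py no longer imports unsloth eagerly, and a missing or version-mismatched install only fails the code path that actually needs it. A hedged sketch of the shape; the class name and the from_pretrained arguments mirror the diff, but the exact unsloth signature across releases is not verified here:

class GPTOSSFactorySketch:
    def load_for_training(self, config):
        # Deferred import: unsloth is resolved on first use, not at module
        # import, so an ImportError points at the backend actually selected.
        from unsloth import FastLanguageModel as UnslothFastLanguageModel

        model, tokenizer = UnslothFastLanguageModel.from_pretrained(
            model_name=config.model_name,
            max_seq_length=config.max_seq_length,
        )
        return model, tokenizer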
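get_input() as moved in patch 49 builds the inverse-design prompt, but its if/elif chain leaves prefix unbound whenever config.chem_info holds a value other than the four it tests, which later surfaces as a NameError rather than a clear message. A defensive variant, sketched with a dict of templates; the fallback behavior is an assumption, not current project behavior:

# Sketch: same prompt construction as get_input(), but failing loudly on an
# unexpected chem_info value instead of with an unrelated NameError.
PREFIX_TEMPLATES = {
    "none": "",
    "element_list": "The chemical elements are {chem} . ",
    "element_dict": "The chemical contents are {chem} . ",
    "formula": "The chemical formula is {chem} . ",
}


def build_prompt(config, chem="", val=10):
    try:
        prefix = PREFIX_TEMPLATES[config.chem_info].format(chem=chem)
    except KeyError:
        raise ValueError("Unsupported chem_info: %r" % config.chem_info)
    return (
        prefix + "The " + config.prop + " is " + str(val) + "." + config.output_prompt
    )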
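Patch 50 drops resume_from_checkpoint=True, which is the right default: with the transformers Trainer API, train(resume_from_checkpoint=True) raises a ValueError on any fresh run, because no checkpoint-* directory exists yet in the output directory. If opportunistic resumption is still wanted, the usual compromise is to resolve the checkpoint first; a sketch assuming the trainer follows the transformers API and that the output directory lives on the config (that attribute name is an assumption):

from transformers.trainer_utils import get_last_checkpoint

# Newest checkpoint-<step> directory, or None when the directory holds no
# checkpoints, so a fresh run starts from scratch instead of raising.
last_ckpt = get_last_checkpoint(config.output_dir)
trainer_stats = trainer.train(resume_from_checkpoint=last_ckpt)
trainer.save_model(config.model_save_path)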