diff --git a/atomgpt/inverse_models/dataset_utils.py b/atomgpt/inverse_models/dataset_utils.py index 8bf6b81..889260a 100644 --- a/atomgpt/inverse_models/dataset_utils.py +++ b/atomgpt/inverse_models/dataset_utils.py @@ -6,6 +6,16 @@ from typing import Union, Callable, Optional, List, Dict import torch +from typing import Any +from jarvis.core.atoms import Atoms +from jarvis.io.vasp.inputs import Poscar +from jarvis.core.composition import Composition +from atomgpt.inverse_models.utils import ( + gen_atoms, + text2atoms, + get_crystal_string_t, + get_figlet, +) # From https://www.geeksforgeeks.org/longest-common-substring-array-strings/ @@ -753,6 +763,100 @@ def _tokenize(example): ) pass return dataset +pass +def get_input(config=None, chem="", val=10): + if config.chem_info == "none": + prefix = "" + elif config.chem_info == "element_list": + prefix = ( + "The chemical elements are " + + chem # atoms.composition.search_string + + " . " + ) + elif config.chem_info == "element_dict": + prefix = ( + "The chemical contents are " + + chem # atoms.composition.search_string + + " . " + ) + elif config.chem_info == "formula": + prefix = ( + "The chemical formula is " + + chem # atoms.composition.reduced_formula + + " . " + ) -pass + inp = ( + prefix + + "The " + + config.prop + + " is " + + str(val) + + "." + + config.output_prompt + ) + return inp + +def make_alpaca_json( + dataset=[], + jids=[], + # prop="Tc_supercon", + # instruction="", + include_jid=False, + # chem_info="", + # output_prompt="", + config=None, +): + mem = [] + print("config.prop", config.prop) + for i in dataset: + if i[config.prop] != "na" and i[config.id_tag] in jids: + atoms = Atoms.from_dict(i["atoms"]) + info = {} + if include_jid: + info["id"] = i[config.id_tag] + info["instruction"] = config.instruction + if config.chem_info == "none": + chem = "" + elif config.chem_info == "element_list": + chem = atoms.composition.search_string + elif config.chem_info == "element_dict": + comp = Composition.from_string( + atoms.composition.reduced_formula + ) + chem = comp.to_dict() + chem = str(dict(sorted(chem.items()))) + elif config.chem_info == "formula": + chem = atoms.composition.reduced_formula + + inp = get_input(config=config, val=i[config.prop], chem=chem) + info["input"] = inp + + info["output"] = get_crystal_string_t(atoms) + mem.append(info) + return mem + +def alpaca_formatting_prompts_func(examples: Dict[str, Any], alpaca_prompt: str, eos_token: str) -> Dict[str, List[str]]: + inst = examples["instruction"] + inp = examples["input"] + out = examples["output"] + texts = [alpaca_prompt.format(i, x, y) + eos_token for i, x, y in zip(inst, inp, out)] + return {"text": texts} + +def harmony_formatting_prompts_func(examples: Dict[str, Any], tokenizer) -> Dict[str, List[str]]: + inst = examples["instruction"] + inp = examples["input"] + out = examples["output"] + texts: List[str] = [] + for i, x, y in zip(inst, inp, out): + messages = [] + i = (i or "").strip() + x = (x or "").strip() + y = (y or "").strip() + if i: + messages.append({"role": "developer", "content": i}) + messages.append({"role": "user", "content": x}) + messages.append({"role": "assistant", "content": y}) + texts.append(tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=False)) + return {"text": texts} diff --git a/atomgpt/inverse_models/factories.py b/atomgpt/inverse_models/factories.py new file mode 100644 index 0000000..44a6bf4 --- /dev/null +++ b/atomgpt/inverse_models/factories.py @@ -0,0 +1,149 @@ +# factories.py + +from 
__future__ import annotations
+
+from abc import ABC, abstractmethod
+from atomgpt.inverse_models.products import LoadedModel
+from typing import Callable
+from typing import TYPE_CHECKING
+if TYPE_CHECKING:
+    from atomgpt.inverse_models.inverse_models import TrainingPropConfig
+from peft import PeftModel
+from typing import Dict
+from atomgpt.inverse_models.dataset_utils import alpaca_formatting_prompts_func
+from atomgpt.inverse_models.dataset_utils import harmony_formatting_prompts_func
+from functools import partial
+from typing import List
+
+
+class LanguageModelFactory(ABC):
+    @abstractmethod
+    def load_for_training(self, config: TrainingPropConfig) -> LoadedModel:
+        pass
+
+    @abstractmethod
+    def load_for_inference(self, checkpoint_path: str, config: TrainingPropConfig) -> LoadedModel:
+        pass
+
+    @abstractmethod
+    def get_formatting_prompts_func(self, config, model, tokenizer) -> Callable:
+        pass
+
+
+class AtomGPTFactory(LanguageModelFactory):
+    def load_for_training(self, config: TrainingPropConfig) -> LoadedModel:
+        from atomgpt.inverse_models.loader import FastLanguageModel as AtomGPTFastLanguageModel
+
+        model, tokenizer = AtomGPTFastLanguageModel.from_pretrained(
+            model_name=config.model_name,
+            max_seq_length=config.max_seq_length,
+            dtype=config.dtype,
+            load_in_4bit=config.load_in_4bit,
+        )
+        if not isinstance(model, PeftModel):
+            print("Not yet a peft model, converting into peft model")
+            model = AtomGPTFastLanguageModel.get_peft_model(
+                model,
+                r=config.lora_rank,  # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
+                target_modules=[
+                    "q_proj",
+                    "k_proj",
+                    "v_proj",
+                    "o_proj",
+                    "gate_proj",
+                    "up_proj",
+                    "down_proj",
+                ],
+                lora_alpha=config.lora_alpha,
+                lora_dropout=0,  # Supports any, but = 0 is optimized
+                bias="none",  # Supports any, but = "none" is optimized
+                use_gradient_checkpointing=True,
+                random_state=3407,
+                use_rslora=False,  # We support rank stabilized LoRA
+                loftq_config=None,  # And LoftQ
+            )
+            print("Peft model created")
+        return LoadedModel(model=model, tokenizer=tokenizer)
+
+    def load_for_inference(self, checkpoint_path: str, config: TrainingPropConfig) -> LoadedModel:
+        # Imported locally, mirroring load_for_training, so the backend is only pulled in when needed.
+        from atomgpt.inverse_models.loader import FastLanguageModel as AtomGPTFastLanguageModel
+
+        model, tokenizer = AtomGPTFastLanguageModel.from_pretrained(
+            model_name=checkpoint_path,
+            max_seq_length=config.max_seq_length,
+            dtype=config.dtype,
+            load_in_4bit=config.load_in_4bit,
+        )
+        AtomGPTFastLanguageModel.for_inference(model)
+        return LoadedModel(model=model, tokenizer=tokenizer)
+
+    def get_formatting_prompts_func(self, config, model, tokenizer) -> Callable:
+        eos = tokenizer.eos_token or ""
+        return partial(alpaca_formatting_prompts_func, alpaca_prompt=config.alpaca_prompt, eos_token=eos)
+
+
+class GPTOSSFactory(LanguageModelFactory):
+    def load_for_training(self, config: TrainingPropConfig) -> LoadedModel:
+        from unsloth import FastLanguageModel as UnslothFastLanguageModel
+
+        model, tokenizer = UnslothFastLanguageModel.from_pretrained(
+            model_name=config.model_name,
+            max_seq_length=config.max_seq_length,
+            dtype=config.dtype,
+            load_in_4bit=config.load_in_4bit,
+            full_finetuning=False,
+        )
+        if not isinstance(model, PeftModel):
+            print("Not yet a peft model, converting into peft model")
+            model = UnslothFastLanguageModel.get_peft_model(
+                model,
+                r=config.lora_rank,  # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
+                target_modules=[
+                    "q_proj",
+                    "k_proj",
+                    "v_proj",
+                    "o_proj",
+                    "gate_proj",
+                    "up_proj",
+                    "down_proj",
+                ],
+                lora_alpha=config.lora_alpha,
+                lora_dropout=0,  # Supports any, but = 0 is optimized
+                bias="none",  # Supports any, but = "none" is optimized
+                use_gradient_checkpointing=True,
+                random_state=3407,
+                use_rslora=False,  # We support rank stabilized LoRA
+                loftq_config=None,  # And LoftQ
+            )
+            print("Peft model created")
+        return LoadedModel(model=model, tokenizer=tokenizer)
+
+    def load_for_inference(self, checkpoint_path: str, config: TrainingPropConfig) -> LoadedModel:
+        # Imported locally, mirroring load_for_training, so unsloth is only pulled in when needed.
+        from unsloth import FastLanguageModel as UnslothFastLanguageModel
+
+        model, tokenizer = UnslothFastLanguageModel.from_pretrained(
+            model_name=checkpoint_path,
+            max_seq_length=config.max_seq_length,
+            dtype=config.dtype,
+            load_in_4bit=config.load_in_4bit,
+        )
+        UnslothFastLanguageModel.for_inference(model)
+        return LoadedModel(model=model, tokenizer=tokenizer)
+
+    def get_formatting_prompts_func(self, config, model, tokenizer) -> Callable:
+        return partial(harmony_formatting_prompts_func, tokenizer=tokenizer)
+
+
+FACTORY_REGISTRY: Dict[str, type[LanguageModelFactory]] = {
+    "gemma": AtomGPTFactory,
+    "qwen": AtomGPTFactory,
+    "Meta": AtomGPTFactory,
+    "Llama": AtomGPTFactory,
+    "llama": AtomGPTFactory,
+    "Mistral": AtomGPTFactory,
+    "mistral": AtomGPTFactory,
+    "gpt-oss": GPTOSSFactory,
+}
+
+
+def get_lm_factory(config: TrainingPropConfig) -> LanguageModelFactory:
+    model_name = config.model_name
+    if "gpt-oss" in model_name:
+        return GPTOSSFactory()
+    else:
+        return AtomGPTFactory()
diff --git a/atomgpt/inverse_models/gpt_oss.py b/atomgpt/inverse_models/gpt_oss.py
new file mode 100644
index 0000000..3355093
--- /dev/null
+++ b/atomgpt/inverse_models/gpt_oss.py
@@ -0,0 +1,187 @@
+from atomgpt.inverse_models.llama import *  # noqa: F401,F403
+import os
+
+from atomgpt.inverse_models._utils import __version__  # noqa: F401
+from atomgpt.inverse_models._utils2 import Version, _get_dtype  # noqa: F401
+
+try:
+    # New HF GPT-OSS modeling API
+    from transformers.models.gpt_oss.modeling_gpt_oss import (
+        GptOssModel,
+        GptOssForCausalLM,
+    )
+except Exception as exc:  # pragma: no cover
+    raise ImportError(
+        "AtomGPT: transformers installation does not appear to include "
+        "the `gpt_oss` model. Please upgrade transformers:\n"
+        '    pip install --upgrade "transformers"\n'
+        "and ensure you are on a release that supports GPT-OSS."
+    ) from exc
+
+# --- AtomGPT: fix GPT-OSS position_ids shape for rotary embeddings ---
+# Some fast-generation paths may end up passing a 1D tensor for `position_ids`
+# (shape [seq_len]), but GPT-OSS's rotary embeddings expect [batch, seq_len].
+# This wrapper upgrades 1D position_ids → [1, seq_len] to avoid IndexError.
+
+if not hasattr(GptOssModel, "_atomgpt_position_ids_patched"):
+    _original_gpt_oss_forward = GptOssModel.forward
+
+    def _atomgpt_gpt_oss_forward(self, *args, **kwargs):
+        pos = kwargs.get("position_ids", None)
+        try:
+            if pos is not None and hasattr(pos, "dim") and pos.dim() == 1:
+                # [seq_len] -> [1, seq_len]
+                kwargs["position_ids"] = pos.unsqueeze(0)
+        except Exception:
+            # Best-effort: never let our fix be the thing that breaks.
+            pass
+        return _original_gpt_oss_forward(self, *args, **kwargs)
+
+    GptOssModel.forward = _atomgpt_gpt_oss_forward
+    GptOssModel._atomgpt_position_ids_patched = True
+
+    print(
+        "AtomGPT: Patched GptOssModel.forward to fix 1D position_ids for GPT-OSS rotary embeddings."
+ ) + + +# Convenience list of all 4 Unsloth GPT-OSS models that are supported via +# FastLanguageModel.from_pretrained(..., model_name=...). +# +# You can use these as drop-in `model_name` values: +# +# from atomgpt.inverse_models.gpt_oss import UNSLOTH_GPT_OSS_MODELS +# model_name = UNSLOTH_GPT_OSS_MODELS[0] +# +UNSLOTH_GPT_OSS_MODELS = [ + # BitsAndBytes 4bit Unsloth quantizations + "unsloth/gpt-oss-20b-unsloth-bnb-4bit", + "unsloth/gpt-oss-120b-unsloth-bnb-4bit", + # MXFP4 “original” weights that Unsloth wraps + "unsloth/gpt-oss-20b", + "unsloth/gpt-oss-120b", +] + + +def _log_once(msg: str) -> None: + """Tiny helper to avoid spamming logs if imported multiple times.""" + if getattr(_log_once, "_seen", None) is None: + _log_once._seen = set() + if msg in _log_once._seen: + return + _log_once._seen.add(msg) + print(msg) + + +class FastGptOssModel(FastLlamaModel): + """ + Fast GPT-OSS integration for AtomGPT. + + This mirrors the overall structure of `FastMistralModel` but takes a more + conservative approach: + + * We **do not** override GPT-OSS attention / MoE internals. Those are + handled by the upstream `transformers` implementation and whatever + `unsloth_compile_transformers` is already doing in your loader. + * We **do**: + - patch PEFT `PeftModelForCausalLM.forward` to the same fast path + that LLaMA / Mistral use. + - (for now) leave `GptOssForCausalLM.prepare_inputs_for_generation` + untouched, because the LLaMA-style patch breaks GPT-OSS attention + shapes during sampling. + * Everything else is delegated to `FastLlamaModel.from_pretrained` with + `model_patcher=FastGptOssModel`, to keep the hierarchy uniform. + """ + + @staticmethod + def pre_patch(): + """ + Apply GPT-OSS-specific patches. + + We deliberately do **not** touch GPT-OSS attention / decoder layer + implementations here, to avoid shape / MoE wiring mistakes. Instead we + reuse only the architecture-agnostic bits from `llama.py`. + """ + # Reuse the PEFT fast forward path (architecture-agnostic: it only + # assumes a CausalLM head with `.lm_head`). + global PeftModelForCausalLM # imported from llama.py via * + PeftModelForCausalLM.forward = PeftModelForCausalLM_fast_forward + + # IMPORTANT: + # Do NOT call `fix_prepare_inputs_for_generation(GptOssForCausalLM)` + # here. That patch is tailored to LLaMA/Mistral KV-cache semantics and + # causes attention shape mismatches for GPT-OSS (e.g. value_states + # ending up with seq_len = 1 instead of the full context length). + # + # We'll rely on the official transformers implementation of + # `prepare_inputs_for_generation` for GPT-OSS instead. + # fix_prepare_inputs_for_generation(GptOssForCausalLM) + + _log_once( + "AtomGPT: Patched GPT-OSS (PEFT fast forward only; " + "using native prepare_inputs_for_generation)." + ) + return + + + @staticmethod + def from_pretrained( + model_name: str = "unsloth/gpt-oss-20b-unsloth-bnb-4bit", + max_seq_length: int | None = None, + dtype=None, + load_in_4bit: bool = True, + token=None, + device_map: str | dict = "sequential", + rope_scaling=None, # GPT-OSS does not use classic RoPE scaling, kept for API symmetry + fix_tokenizer: bool = True, + model_patcher=None, + tokenizer_name: str | None = None, + trust_remote_code: bool = False, + **kwargs, + ): + """ + Thin wrapper around `FastLlamaModel.from_pretrained`. + + The important part is that we pass `model_patcher=FastGptOssModel`, + which causes: + + * `FastGptOssModel.pre_patch()` to run before loading. 
+          * All the Unsloth / AtomGPT compile + quantization machinery to be
+            reused exactly as for LLaMA / Mistral.
+
+        Usage (drop-in with your loader):
+
+            from atomgpt.inverse_models.loader import FastLanguageModel
+
+            model, tokenizer = FastLanguageModel.from_pretrained(
+                model_name="unsloth/gpt-oss-20b-unsloth-bnb-4bit",
+                max_seq_length=2048,
+                dtype=None,
+                load_in_4bit=True,
+            )
+        """
+        # Defer to the LLaMA machinery – it will:
+        #   * call FastGptOssModel.pre_patch()
+        #   * run unsloth_compile_transformers
+        #   * handle bitsandbytes / 4bit / 8bit / PEFT, etc.
+        return FastLlamaModel.from_pretrained(
+            model_name=model_name,
+            max_seq_length=max_seq_length,
+            dtype=dtype,
+            load_in_4bit=load_in_4bit,
+            token=token,
+            device_map=device_map,
+            rope_scaling=rope_scaling,
+            fix_tokenizer=fix_tokenizer,
+            model_patcher=FastGptOssModel if model_patcher is None else model_patcher,
+            tokenizer_name=tokenizer_name,
+            trust_remote_code=trust_remote_code,
+            **kwargs,
+        )
+
+
+__all__ = [
+    "FastGptOssModel",
+    "UNSLOTH_GPT_OSS_MODELS",
+]
diff --git a/atomgpt/inverse_models/inverse_models.py b/atomgpt/inverse_models/inverse_models.py
index 9744286..e343437 100644
--- a/atomgpt/inverse_models/inverse_models.py
+++ b/atomgpt/inverse_models/inverse_models.py
@@ -1,7 +1,9 @@
 from typing import Optional
 from atomgpt.inverse_models.loader import FastLanguageModel
+from typing import Dict
+from typing import Literal
+from atomgpt.inverse_models.factories import get_lm_factory
 
-# from unsloth import FastLanguageModel
 from atomgpt.inverse_models.callbacks import (
     PrintGPUUsageCallback,
     ExampleTrainerCallback,
@@ -27,13 +28,15 @@
 from jarvis.io.vasp.inputs import Poscar
 import csv
 import os
+import numpy as np
 from pydantic_settings import BaseSettings
 import sys
 import json
 import argparse
-from typing import Literal
 import time
 from jarvis.core.composition import Composition
+import traceback
+from atomgpt.inverse_models.dataset_utils import make_alpaca_json
 
 # from atomgpt.inverse_models.custom_trainer import CustomSFTTrainer
 
@@ -107,95 +110,6 @@ class TrainingPropConfig(BaseSettings):
     logging_steps: int = 10
 
 
-def get_input(config=None, chem="", val=10):
-    if config.chem_info == "none":
-        prefix = ""
-    elif config.chem_info == "element_list":
-        prefix = (
-            "The chemical elements are "
-            + chem  # atoms.composition.search_string
-            + " . "
-        )
-    elif config.chem_info == "element_dict":
-        prefix = (
-            "The chemical contents are "
-            + chem  # atoms.composition.search_string
-            + " . "
-        )
-    elif config.chem_info == "formula":
-        prefix = (
-            "The chemical formula is "
-            + chem  # atoms.composition.reduced_formula
-            + " . "
-        )
-
-    inp = (
-        prefix
-        + "The "
-        + config.prop
-        + " is "
-        + str(val)
-        + "."
- + config.output_prompt - ) - return inp - - -def make_alpaca_json( - dataset=[], - jids=[], - # prop="Tc_supercon", - # instruction="", - include_jid=False, - # chem_info="", - # output_prompt="", - config=None, -): - mem = [] - print("config.prop", config.prop) - for i in dataset: - if i[config.prop] != "na" and i[config.id_tag] in jids: - atoms = Atoms.from_dict(i["atoms"]) - info = {} - if include_jid: - info["id"] = i[config.id_tag] - info["instruction"] = config.instruction - if config.chem_info == "none": - chem = "" - elif config.chem_info == "element_list": - chem = atoms.composition.search_string - elif config.chem_info == "element_dict": - comp = Composition.from_string( - atoms.composition.reduced_formula - ) - chem = comp.to_dict() - chem = str(dict(sorted(chem.items()))) - elif config.chem_info == "formula": - chem = atoms.composition.reduced_formula - - inp = get_input(config=config, val=i[config.prop], chem=chem) - info["input"] = inp - - info["output"] = get_crystal_string_t(atoms) - mem.append(info) - return mem - - -def formatting_prompts_func(examples, alpaca_prompt): - instructions = examples["instruction"] - inputs = examples["input"] - outputs = examples["output"] - texts = [] - EOS_TOKEN = "" - for instruction, input, output in zip(instructions, inputs, outputs): - # Must add EOS_TOKEN, otherwise your generation will go on forever! - text = alpaca_prompt.format(instruction, input, output) + EOS_TOKEN - texts.append(text) - return { - "text": texts, - } - - def load_model(path="", config=None): if config is None: config_file = os.path.join(path, "config.json") @@ -211,42 +125,112 @@ def load_model(path="", config=None): FastLanguageModel.for_inference(model) return model, tokenizer, config +def _validate_atoms(atoms): + if atoms is None: + return False, "atoms_is_none" + try: + lat = np.asarray(getattr(atoms, "lattice_mat", None), dtype=float) + if lat.shape != (3, 3): + return False, f"bad_lattice_shape:{getattr(atoms,'lattice_mat',None)}" + if not np.isfinite(lat).all(): + return False, "nonfinite_lattice" + n = getattr(atoms, "num_atoms", None) + if n is None or n <= 0: + return False, f"num_atoms_invalid:{n}" + _ = Poscar(atoms).to_string() + return True, "" + except Exception as e: + return False, f"poscar_fail:{type(e).__name__}:{e}" + +def _poscar_one_line(at): + return Poscar(at).to_string().replace("\n", "\\n") + +def _misses_path(csv_out, config): + fname = getattr(config, "miss_csv", None) + if fname is None or not str(fname).strip(): + root, ext = os.path.splitext(csv_out) + fname = root + ".misses.csv" + os.makedirs(os.path.dirname(os.path.abspath(fname)), exist_ok=True) + return fname def evaluate( - test_set=[], model="", tokenizer="", csv_out="out.csv", config="" + test_set=[], + model="", + tokenizer="", + csv_out="out.csv", + config="", ): print("Testing\n", len(test_set)) - f = open(csv_out, "w") - f.write("id,target,prediction\n") + os.makedirs(os.path.dirname(os.path.abspath(csv_out)), exist_ok=True) + miss_csv_out = _misses_path(csv_out, config) + + with open(csv_out, "w", newline="") as f_ok, open(miss_csv_out, "w", newline="") as f_miss: + ok_writer = csv.writer(f_ok) + miss_writer = csv.writer(f_miss) + ok_writer.writerow(["id", "target", "prediction"]) + miss_writer.writerow(["id", "stage", "error", "detail", "raw_text_preview"]) + + for i in tqdm(test_set, total=len(test_set)): + sample_id = i.get("id", "") + target_mat = None + target_err = None + try: + target_mat = text2atoms("\n" + i["output"]) + if os.environ.get("PRINT_STRUCTURES"): 
+ print(f"Target Structure ({sample_id}):") + print(target_mat) + + ok, detail = _validate_atoms(target_mat) + if not ok: + target_err = detail + except Exception as e: + target_err = f"text2atoms:{type(e).__name__}:{e}" + if os.environ.get("PRINT_STRUCTURES"): + print(f"Target Structure ({sample_id}) FAILED: {target_err}") + + if target_err: + miss_writer.writerow([sample_id, "target", "invalid_target", target_err, (i.get("output","")[:240])]) + continue + + gen_mat = None + gen_err = None + raw_response = "" + try: + gen_mat, raw_response = gen_atoms( + prompt=i["input"], + tokenizer=tokenizer, + model=model, + alpaca_prompt=config.alpaca_prompt, + instruction=config.instruction, + ) + if os.environ.get("PRINT_STRUCTURES"): + print(f"Predicted Structure ({sample_id}):") + print(gen_mat) + + ok, detail = _validate_atoms(gen_mat) + if not ok: + gen_err = detail + except Exception as e: + gen_err = f"gen_atoms:{type(e).__name__}:{e}" + if os.environ.get("PRINT_STRUCTURES"): + print(f"Predicted Structure ({sample_id}) FAILED: {gen_err}") + print(traceback.format_exc()) + print(f"Raw LLM Output ({sample_id}):") + print(raw_response) + + if gen_err: + miss_writer.writerow([sample_id, "prediction", "invalid_prediction", gen_err, ""]) + continue + + try: + ok_writer.writerow([ + sample_id, + _poscar_one_line(target_mat), + _poscar_one_line(gen_mat), + ]) + except Exception as e: + miss_writer.writerow([sample_id, "write", "write_failed", f"{type(e).__name__}:{e}", ""]) - for i in tqdm(test_set, total=len(test_set)): - # try: - # prompt = i["input"] - # print("prompt", prompt) - gen_mat = gen_atoms( - prompt=i["input"], - tokenizer=tokenizer, - model=model, - alpaca_prompt=config.alpaca_prompt, - instruction=config.instruction, - ) - target_mat = text2atoms("\n" + i["output"]) - print("target_mat", target_mat) - print("genmat", gen_mat) - line = ( - i["id"] - + "," - + Poscar(target_mat).to_string().replace("\n", "\\n") - + "," - + Poscar(gen_mat).to_string().replace("\n", "\\n") - + "\n" - ) - f.write(line) - # print() - # except Exception as exp: - # print("Error", exp) - # pass - f.close() def batch_evaluate( @@ -335,7 +319,6 @@ def batch_evaluate( def main(config_file=None): if config_file is None: - args = parser.parse_args(sys.argv[1:]) config_file = args.config_name if not torch.cuda.is_available(): @@ -362,17 +345,13 @@ def main(config_file=None): run_path = os.path.dirname(id_prop_path) num_train = config.num_train num_test = config.num_test - # model_name = config.model_name callback_samples = config.callback_samples - # loss_function = config.loss_function - # id_prop_path = os.path.join(run_path, id_prop_path) with open(id_prop_path, "r") as f: reader = csv.reader(f) dt = [row for row in reader] if not num_train: num_test = int(len(dt) * config.test_ratio) num_train = len(dt) - num_test - dat = [] ids = [] for i in tqdm(dt, total=len(dt)): @@ -411,7 +390,7 @@ def main(config_file=None): print("num_train", num_train) print("num_test", num_test) test_ids = ids[num_train : num_train + num_test] - # test_ids = ids[num_train:] + alpaca_prop_train_filename = os.path.join( config.output_dir, "alpaca_prop_train.json" ) @@ -451,42 +430,11 @@ def main(config_file=None): print(alpaca_prop_test_filename, "exists") m_test = loadjson(alpaca_prop_test_filename) - # 4bit pre quantized models we support for 4x faster downloading + no OOMs. - model, tokenizer = FastLanguageModel.from_pretrained( - model_name=config.model_name, # Choose ANY! 
eg teknium/OpenHermes-2.5-Mistral-7B - max_seq_length=config.max_seq_length, - dtype=config.dtype, - load_in_4bit=config.load_in_4bit, - # token = "hf_...", # use one if using gated models like meta-llama/Llama-2-7b-hf - ) - if not isinstance(model, PeftModel): - # import sys - print("Not Peft model") - # sys.exit() - model = FastLanguageModel.get_peft_model( - model, - r=config.lora_rank, # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128 - target_modules=[ - "q_proj", - "k_proj", - "v_proj", - "o_proj", - "gate_proj", - "up_proj", - "down_proj", - ], - lora_alpha=config.lora_alpha, - lora_dropout=0, # Supports any, but = 0 is optimized - bias="none", # Supports any, but = "none" is optimized - use_gradient_checkpointing=True, - random_state=3407, - use_rslora=False, # We support rank stabilized LoRA - loftq_config=None, # And LoftQ - ) + factory = get_lm_factory(config) + loaded: LoadedModel = factory.load_for_training(config) + model, tokenizer = loaded.model, loaded.tokenizer + formatting_prompts_func = factory.get_formatting_prompts_func(config, model, tokenizer) - EOS_TOKEN = tokenizer.eos_token # Must add EOS_TOKEN - # tokenizer.pad_token_id = tokenizer.eos_token_id - # model.resize_token_embeddings(len(tokenizer)) train_dataset = load_dataset( "json", data_files=alpaca_prop_train_filename, @@ -499,9 +447,6 @@ def main(config_file=None): split="train", # "json", data_files="alpaca_prop_train.json", split="train" ) - formatting_prompts_func_with_prompt = partial( - formatting_prompts_func, alpaca_prompt=config.alpaca_prompt - ) def tokenize_function(example): return tokenizer( @@ -512,12 +457,14 @@ def tokenize_function(example): ) train_dataset = train_dataset.map( - formatting_prompts_func_with_prompt, + formatting_prompts_func, batched=True, + num_proc=config.dataset_num_proc ) eval_dataset = eval_dataset.map( - formatting_prompts_func_with_prompt, + formatting_prompts_func, batched=True, + num_proc=config.dataset_num_proc ) # Compute the actual max sequence length in raw text lengths = [ @@ -527,8 +474,8 @@ def tokenize_function(example): max_seq_length = max(lengths) print(f"🧠 Suggested max_seq_length based on dataset: {max_seq_length}") - tokenized_train = train_dataset.map(tokenize_function, batched=True) - tokenized_eval = eval_dataset.map(tokenize_function, batched=True) + tokenized_train = train_dataset.map(tokenize_function, batched=True, num_proc=config.dataset_num_proc) + tokenized_eval = eval_dataset.map(tokenize_function, batched=True, num_proc=config.dataset_num_proc) tokenized_train.set_format( type="torch", columns=["input_ids", "attention_mask", "output"] ) @@ -575,8 +522,6 @@ def tokenize_function(example): trainer = SFTTrainer( model=model, train_dataset=tokenized_train, - # train_dataset = train_dataset, - # tokenizer = tokenizer, args=SFTConfig( dataset_text_field="text", max_seq_length=config.max_seq_length, @@ -593,49 +538,26 @@ def tokenize_function(example): num_train_epochs=config.num_epochs, save_strategy=config.save_strategy, save_steps=config.save_steps, - ), + disable_tqdm=False, + log_level="info", + ), ) if callback_samples > 0: callback = ExampleTrainerCallback( some_tokenized_dataset=tokenized_eval, - # some_tokenized_dataset=tokenized_eval, tokenizer=tokenizer, max_length=config.max_seq_length, callback_samples=callback_samples, ) trainer.add_callback(callback) + gpu_usage = PrintGPUUsageCallback() trainer.add_callback(gpu_usage) trainer_stats = trainer.train() trainer.save_model(config.model_save_path) - # 
model.save_pretrained(config.model_save_path)
-    # model, tokenizer = FastLanguageModel.from_pretrained(
-    #     model_name=config.model_save_path,  # YOUR MODEL YOU USED FOR TRAINING
-    #     max_seq_length=config.max_seq_length,
-    #     dtype=config.dtype,
-    #     load_in_4bit=config.load_in_4bit,
-    # )
     model = trainer.model
-    FastLanguageModel.for_inference(model)  # Enable native 2x faster inference
-    # model, tokenizer, config = load_model(path=config.model_save_path)
-    # batch_evaluate(
-    #     prompts=[i["input"] for i in m_test],
-    #     model=model,
-    #     tokenizer=tokenizer,
-    #     csv_out=config.csv_out,
-    #     config=config,
-    # )
-    # t1 = time.time()
-    # batch_evaluate(
-    #     test_set=m_test,
-    #     model=model,
-    #     tokenizer=tokenizer,
-    #     csv_out=config.csv_out,
-    #     config=config,
-    # )
-    # t2 = time.time()
-    # t1a = time.time()
+    FastLanguageModel.for_inference(model)
     evaluate(
         test_set=m_test,
         model=model,
diff --git a/atomgpt/inverse_models/inverse_predict.py b/atomgpt/inverse_models/inverse_predict.py
index 378ca35..63054cf 100644
--- a/atomgpt/inverse_models/inverse_predict.py
+++ b/atomgpt/inverse_models/inverse_predict.py
@@ -79,13 +79,10 @@ def relax_atoms(
     calculator = AlignnAtomwiseCalculator(path=default_path(), device="cpu")
 
     t1 = time.time()
-    # if calculator is None:
-    #    return atoms
     ase_atoms = atoms.ase_converter()
     ase_atoms.calc = calculator
     ase_atoms = ExpCellFilter(ase_atoms, constant_volume=constant_volume)
 
-    # TODO: Make it work with any other optimizer
     dyn = FIRE(ase_atoms)
     dyn.run(fmax=fmax, steps=nsteps)
     en = ase_atoms.atoms.get_potential_energy()
@@ -114,9 +111,6 @@ def predict(
     load_in_4bit=None,  # temp_config["load_in_4bit"]
     verbose=True,  # temp_config["load_in_4bit"]
 ):
-    # if not os.path.exists("config_name"):
-
-    # config_name=os.path.join(output_dir,"config.json")
     print("config_path", config_path)
     if output_dir is not None:
         config_name = os.path.join(output_dir, "config.json")
@@ -125,7 +119,7 @@
             config_name = os.path.join(parent, "config.json")
         adapter = os.path.join(output_dir, "adapter_config.json")
         if os.path.exists(adapter):
-            model_name = output_dir  # temp_config["model_name"]
+            model_name = output_dir
     if config_path is not None:
         config_name = config_path
     if verbose:
@@ -198,21 +192,18 @@
                 formula=formula,
                 background_subs=background_subs,
             )
-            # y[y < 0.1] = 0
-            y_new_str = y  # "\n".join(["{0:.2f}".format(x) for x in y])
+            y_new_str = y
             try:
                 if ".dat" in i:
                     formula = str(_formula.split("/")[-1].split(".dat")[0])
             except Exception:
                 pass
-            # gen_mat = main_spectra(spectra=[[y_new_str,y]],formulas=[formula],model=model,tokenizer=tokenizer,device='cuda')[0]
             prompt = (
                 "The chemical formula is "
                 + formula
                 + " The "
                 + temp_config["prop"]
                 + " is "
-                # + " The XRD is "
                 + y_new_str
                 + ". Generate atomic structure description with lattice lengths, angles, coordinates and atom types."
             )
@@ -224,13 +215,13 @@
                 + " The "
                 + temp_config["prop"]
                 + " is "
-                # + " The XRD is "
                 + str(prop_val)
                 + ". Generate atomic structure description with lattice lengths, angles, coordinates and atom types."
             )
         if verbose:
             print("prompt here", prompt.replace("\n", ","))
+
-        gen_mat = gen_atoms(
+        gen_mat, _ = gen_atoms(
             prompt=prompt,
             model=model,
             alpaca_prompt=temp_config["alpaca_prompt"],
             tokenizer=tokenizer,
             instruction=temp_config["instruction"],
             device=device,
         )
+
+        if gen_mat is None:
+            print(
+                "The structure returned by gen_atoms() is not a valid crystal structure."
+            )
+            info = {}
+            info["prompt"] = prompt
+            info["error"] = "Invalid structure returned by AtomGPT (None)."
+ mem.append(info) + # skip the rest of the loop for this entry + continue + if verbose: print("gen atoms", gen_mat) print("gen atoms spacegroup", gen_mat.spacegroup()) @@ -257,8 +260,6 @@ def predict( if __name__ == "__main__": - # output_dir = make_id_prop() - # output_dir="." args = parser.parse_args(sys.argv[1:]) print("args.config_path", args.config_path) predict( @@ -271,5 +272,4 @@ def predict( config_path=args.config_path, prop_val=args.prop_val, background_subs=args.background_subs, - # config_name=args.config_name, ) diff --git a/atomgpt/inverse_models/llama.py b/atomgpt/inverse_models/llama.py index f647a75..6f29d0a 100644 --- a/atomgpt/inverse_models/llama.py +++ b/atomgpt/inverse_models/llama.py @@ -2012,11 +2012,40 @@ def unsloth_fast_generate( # For newer HF kwargs["cache_implementation"] = "dynamic" - # For num_logits_to_keep - num_logits_to_keep = kwargs.get("num_logits_to_keep", None) - logits_to_keep = kwargs.get("logits_to_keep", None) - if num_logits_to_keep is None and logits_to_keep is None: - kwargs["num_logits_to_keep"] = 1 + + # --- Handle num_logits_to_keep / logits_to_keep safely per model type --- + import inspect + + model_type = getattr(getattr(self, "config", None), "model_type", None) + + # GPT-OSS does *not* advertise these kwargs in its generate/forward stack, and + # passing them causes HF `generate` → `_validate_model_kwargs` to raise: + # ValueError: The following `model_kwargs` are not used by the model: ['num_logits_to_keep'] + # So for GPT-OSS we always strip them. + if model_type == "gpt_oss": + kwargs.pop("num_logits_to_keep", None) + kwargs.pop("logits_to_keep", None) + else: + # For other models (llama, mistral, etc.), keep Unsloth's optimization: + # only use num_logits_to_keep/logits_to_keep if the model forward supports them. + try: + forward_sig = inspect.signature(self.forward) + supports_num_logits = ( + "num_logits_to_keep" in forward_sig.parameters + or "logits_to_keep" in forward_sig.parameters + ) + except (TypeError, ValueError): + supports_num_logits = False + + if supports_num_logits: + num_logits_to_keep = kwargs.get("num_logits_to_keep", None) + logits_to_keep = kwargs.get("logits_to_keep", None) + if num_logits_to_keep is None and logits_to_keep is None: + # Enable Unsloth's memory optimization for compatible models + kwargs["num_logits_to_keep"] = 1 + else: + kwargs.pop("num_logits_to_keep", None) + kwargs.pop("logits_to_keep", None) # Remove token_type_ids kwargs.pop("token_type_ids", None) @@ -3141,6 +3170,13 @@ def patch_peft_model( apply_lora_mlp = apply_lora_mlp_swiglu elif model_type == "qwen3moe": apply_lora_mlp = apply_lora_mlp_swiglu + elif model_type == "gpt_oss": + if use_gradient_checkpointing == "unsloth": + try: + model.gradient_checkpointing_enable() + except Exception: + pass + return model else: raise NotImplementedError( f"AtomGPT: {model_type} is not yet implemented!" 
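# The llama.py change above is easier to see in isolation. Below is a minimal,
# self-contained sketch of the same kwarg-filtering idea, assuming only a
# `transformers`-style model object; the names `model` and `gen_kwargs` are
# illustrative and not part of the patch. GPT-OSS never receives
# `num_logits_to_keep`/`logits_to_keep` (HF generate() rejects unused kwargs),
# while other architectures only receive them if their forward() accepts them.
import inspect


def filter_generate_kwargs(model, gen_kwargs):
    model_type = getattr(getattr(model, "config", None), "model_type", None)
    if model_type == "gpt_oss":
        # Always strip: GPT-OSS does not advertise these kwargs.
        gen_kwargs.pop("num_logits_to_keep", None)
        gen_kwargs.pop("logits_to_keep", None)
        return gen_kwargs
    try:
        params = inspect.signature(model.forward).parameters
    except (TypeError, ValueError):
        params = {}
    if "num_logits_to_keep" in params or "logits_to_keep" in params:
        # Keep Unsloth's memory optimization for compatible models.
        gen_kwargs.setdefault("num_logits_to_keep", 1)
    else:
        gen_kwargs.pop("num_logits_to_keep", None)
        gen_kwargs.pop("logits_to_keep", None)
    return gen_kwargs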
diff --git a/atomgpt/inverse_models/loader.py b/atomgpt/inverse_models/loader.py index 2a66eeb..0a501ee 100644 --- a/atomgpt/inverse_models/loader.py +++ b/atomgpt/inverse_models/loader.py @@ -6,6 +6,7 @@ USE_MODELSCOPE, get_transformers_model_type, ) +from atomgpt.inverse_models.gpt_oss import FastGptOssModel from atomgpt.inverse_models.granite import FastGraniteModel from atomgpt.inverse_models.llama import FastLlamaModel, logger from atomgpt.inverse_models.mistral import FastMistralModel @@ -44,6 +45,7 @@ SUPPORTS_GRANITE = transformers_version >= Version("4.46.0") SUPPORTS_QWEN3 = transformers_version >= Version("4.50.3") SUPPORTS_QWEN3_MOE = transformers_version >= Version("4.50.3") +SUPPORTS_GPT_OSS = transformers_version >= Version("4.55.0") if SUPPORTS_GEMMA: from atomgpt.inverse_models.gemma import FastGemmaModel if SUPPORTS_GEMMA2: @@ -294,6 +296,17 @@ def from_pretrained( f"to obtain the latest transformers build, then restart this session." ) dispatch_model = FastGemmaModel + elif model_type == "gpt_oss": + if not SUPPORTS_GPT_OSS: + raise ImportError( + f"AtomGPT: Your transformers version of {transformers_version} " + f"does not support GPT-OSS.\n" + f"The minimum required version is 4.55.0.\n" + f'Try `pip install --upgrade "transformers>=4.55.0"`\n' + f"to obtain the latest compatible transformers build, then " + f"restart this session." + ) + dispatch_model = FastGptOssModel elif model_type == "gemma2": if not SUPPORTS_GEMMA2: raise ImportError( @@ -448,12 +461,12 @@ def from_pretrained( pass if load_in_4bit: - # Fix up bitsandbytes config + # Fix up bitsandbytes config, robust to missing torch_dtype. + # Use the same helper we use elsewhere. + compute_dtype = _get_dtype(dtype) # falls back to bf16/fp16 based on hardware + quantization_config = { - # Sometimes torch_dtype is not a string!! - "bnb_4bit_compute_dtype": model.config.to_dict()[ - "torch_dtype" - ], + "bnb_4bit_compute_dtype": compute_dtype, "bnb_4bit_quant_type": "nf4", "bnb_4bit_use_double_quant": True, "llm_int8_enable_fp32_cpu_offload": False, @@ -465,6 +478,7 @@ def from_pretrained( "quant_method": "bitsandbytes", } model.config.update({"quantization_config": quantization_config}) + pass if is_peft: @@ -905,14 +919,27 @@ def from_pretrained( ] ) pass - + if load_in_4bit: - # Fix up bitsandbytes config + # Fix up bitsandbytes config, robust to missing torch_dtype/dtype. + cfg_dict = model.config.to_dict() + + compute_dtype = cfg_dict.get("torch_dtype", None) + if compute_dtype is None: + # Newer configs may use "dtype" instead + compute_dtype = cfg_dict.get("dtype", None) + + # Fall back to the user-specified dtype or a sensible default + if compute_dtype is None: + compute_dtype = _get_dtype(dtype) # imported above + + # Sometimes this is a string like "float16" – map to torch dtype + import torch + if isinstance(compute_dtype, str): + compute_dtype = getattr(torch, compute_dtype, torch.float16) + quantization_config = { - # Sometimes torch_dtype is not a string!! 
- "bnb_4bit_compute_dtype": model.config.to_dict()[ - "torch_dtype" - ], + "bnb_4bit_compute_dtype": compute_dtype, "bnb_4bit_quant_type": "nf4", "bnb_4bit_use_double_quant": True, "llm_int8_enable_fp32_cpu_offload": False, diff --git a/atomgpt/inverse_models/products.py b/atomgpt/inverse_models/products.py new file mode 100644 index 0000000..6004f81 --- /dev/null +++ b/atomgpt/inverse_models/products.py @@ -0,0 +1,16 @@ +# products.py + +from dataclasses import dataclass +from typing import Protocol, Any, Callable +import torch +from transformers import PreTrainedTokenizerBase + +@dataclass +class LoadedModel: + model: torch.nn.Module + tokenizer: PreTrainedTokenizerBase + + +class DatasetFormattingFunction(Protocol): + def get_formatting_prompts_func() -> Callable: + pass diff --git a/atomgpt/inverse_models/utils.py b/atomgpt/inverse_models/utils.py index b5d441f..6395b24 100644 --- a/atomgpt/inverse_models/utils.py +++ b/atomgpt/inverse_models/utils.py @@ -175,7 +175,7 @@ def gen_atoms( print(exp) pass - return atoms + return atoms, response def get_crystal_string_t(atoms): @@ -381,7 +381,7 @@ def main_spectra( + " Generate atomic structure description with lattice lengths, angles, coordinates and atom types." ) # print(info) - atoms = gen_atoms( + atoms, _ = gen_atoms( prompt=info["input"], model=model, alpaca_prompt=alpaca_prompt, diff --git a/atomgpt/scripts/ramangpt/dataset_atomgpt_spectra2.py b/atomgpt/scripts/ramangpt/dataset_atomgpt_spectra2.py new file mode 100644 index 0000000..4751bf1 --- /dev/null +++ b/atomgpt/scripts/ramangpt/dataset_atomgpt_spectra2.py @@ -0,0 +1,137 @@ +# This is an updated version of diffractgpt using find_peaks algorithm +import numpy as np +from scipy.signal import find_peaks +from jarvis.analysis.diffraction.xrd import XRD +from jarvis.core.atoms import Atoms +from jarvis.db.figshare import data +from tqdm import tqdm +from jarvis.db.jsonutils import dumpjson +def get_crystal_string_t(atoms): + lengths = atoms.lattice.abc # structure.lattice.parameters[:3] + angles = atoms.lattice.angles + atom_ids = atoms.elements + frac_coords = atoms.frac_coords + + crystal_str = ( + " ".join(["{0:.2f}".format(x) for x in lengths]) + + "\n" + + " ".join([str(int(x)) for x in angles]) + + "\n" + + "\n".join( + [ + str(t) + " " + " ".join(["{0:.3f}".format(x) for x in c]) + for t, c in zip(atom_ids, frac_coords) + ] + ) + ) + + # crystal_str = atoms_describer(atoms) + "\n*\n" + crystal_str + return crystal_str + + +def gaussian_recast(x_original=[], y_original=[], x_new=[], sigma=.1): + y_new = np.zeros_like(x_new, dtype=np.float64) + for x0, amp in zip(x_original, y_original): + y_new += amp * np.exp(-0.5 * ((x_new - x0) / sigma) ** 2) + x_new=np.array(x_new) + y_new=np.array(y_new) + return x_new, y_new + +def make_diffractgpt_prompt(atoms, jid='na',thetas=[0, 90], num_peaks=20): + """Reads 2θ–intensity data, extracts top N peaks, and builds a prompt for DiffractGPT.""" + two_theta, d, intensity = XRD(thetas=thetas).simulate(atoms=atoms) + intensity = np.array(intensity) + intensity /= intensity.max() + + two_theta = np.array(two_theta) + x_new = np.arange(0, 90, .1) + two_theta,intensity = gaussian_recast(x_original=two_theta,y_original=intensity,x_new=x_new) + #print("two_theta",two_theta) + #print("intensity",intensity) + intensity /= intensity.max() + #print(intensity,max(intensity),len(intensity)) + + # Find all peaks (with minimal height threshold to ignore noise) + peaks, props = find_peaks(intensity, height=0.01, distance=1,prominence=0.05) + 
#print("peaks",peaks) + # Get top N peaks by intensity + top_indices = np.argsort(props['peak_heights'])[::-1][:num_peaks] + top_peaks = peaks[top_indices] + top_peaks_sorted = top_peaks[np.argsort(two_theta[top_peaks])] + + # Create list of (2θ, intensity) + peak_list = [(round(two_theta[p], 2), round(intensity[p], 2)) for p in top_peaks_sorted] + + # Build DiffractGPT prompt + peak_text = ", ".join([f"{t}°({i})" for t, i in peak_list]) + print(jid,peak_text) + num_peaks = len(peaks) + formula=atoms.composition.reduced_formula + input = ( + f"The chemical formula is: {formula}.\n" + f"The XRD pattern shows main peaks at: {peak_text}.\n" + f"Generate atomic structure description with lattice lengths, angles, coordinates and atom types." + ) + output= get_crystal_string_t(atoms) + info={} + info["instruction"]= "Below is a description of a material." + info["input"]=input + info['id']=jid + info['peak_text']=peak_text + info["output"]=output + return info + +# Example usage +if __name__ == "__main__": + #atoms = Atoms.from_poscar('POSCAR') + jids=["JVASP-32","JVASP-15014","JVASP-1002","JVASP-107","JVASP-17957","JVASP-1151"] + f=open('id_prop.csv','w') + dat=data('dft_3d') #[0:num_samples] + test=[] + train=[] + for i in tqdm(dat,total=len(dat)): + if i['jid'] in jids: + atoms=Atoms.from_dict(i['atoms']) + prompt = make_diffractgpt_prompt(atoms, jid=i['jid'],num_peaks=20) + filename='POSCAR-'+i['jid'] + atoms.write_poscar(filename) + line=filename+','+prompt['peak_text']+'\n' + f.write(line) + + + + #print(i['jid'],prompt) + train.append(prompt) + if i['jid']=="JVASP-32": + atoms=Atoms.from_dict(i['atoms']) + prompt = make_diffractgpt_prompt(atoms, jid=i['jid'],num_peaks=20) + #print(i['jid'],prompt) + test.append(prompt) + train.append(prompt) + if i['jid']=="JVASP-15014": + atoms=Atoms.from_dict(i['atoms']) + prompt = make_diffractgpt_prompt(atoms, jid=i['jid'],num_peaks=20) + #print(i['jid'],prompt) + test.append(prompt) + train.append(prompt) + if i['jid']=="JVASP-1002": + atoms=Atoms.from_dict(i['atoms']) + prompt = make_diffractgpt_prompt(atoms, jid=i['jid'],num_peaks=20) + #print(i['jid'],prompt) + test.append(prompt) + train.append(prompt) + if i['jid']=="JVASP-107": + atoms=Atoms.from_dict(i['atoms']) + prompt = make_diffractgpt_prompt(atoms, jid=i['jid'],num_peaks=20) + #print(i['jid'],prompt) + test.append(prompt) + train.append(prompt) + if i['jid']=="JVASP-17957": + atoms=Atoms.from_dict(i['atoms']) + prompt = make_diffractgpt_prompt(atoms, jid=i['jid'],num_peaks=20) + #print(i['jid'],prompt) + test.append(prompt) + train.append(prompt) + f.close() + dumpjson(data=train,filename='alpaca_prop_train.json') + dumpjson(data=test,filename='alpaca_prop_test.json') diff --git a/atomgpt/scripts/ramangpt/make_raman_alpaca.py b/atomgpt/scripts/ramangpt/make_raman_alpaca.py new file mode 100644 index 0000000..8cfd65b --- /dev/null +++ b/atomgpt/scripts/ramangpt/make_raman_alpaca.py @@ -0,0 +1,255 @@ +#!/usr/bin/env python3 +# make_raman_alpaca.py +# +# Read Raman JSON, optionally Niggli-reduce cells, keep only modes whose activity +# is non-zero AFTER rounding to the requested precision, optionally normalize +# frequencies to [0,1] (1.0 = max kept freq), format values, and write +# Alpaca-style train/test JSONs. 
+ +import argparse +import json +import random +from pathlib import Path + +import numpy as np +from tqdm import tqdm +from jarvis.core.atoms import Atoms +from jarvis.db.jsonutils import dumpjson + + +def get_crystal_string_t(atoms: Atoms) -> str: + # Lattice + lengths = np.array(atoms.lattice.abc, dtype=float).ravel() + angles = np.array(atoms.lattice.angles, dtype=float).ravel() + + # Per-site species and fractional coordinates; force shape (N, 3) + atom_ids = [str(x) for x in list(atoms.elements)] + frac = np.asarray(atoms.frac_coords, dtype=float) + if frac.ndim == 1: + if frac.size == 3: + frac = frac.reshape(1, 3) + else: + raise ValueError(f"Unexpected fractional coord shape: {frac.shape}") + elif frac.ndim == 2 and frac.shape[1] != 3: + raise ValueError(f"Expected frac coords with 3 columns, got {frac.shape}") + + # If species length doesn't match coords, broadcast a single species tag + if len(atom_ids) != len(frac): + if len(atom_ids) == 1 and len(frac) > 1: + atom_ids = atom_ids * len(frac) + else: + raise ValueError( + f"Elements length ({len(atom_ids)}) != coords length ({len(frac)})" + ) + + lengths_str = " ".join(f"{x:.2f}" for x in lengths.tolist()) + angles_str = " ".join(f"{x:.2f}" for x in angles.tolist()) + coords_str = "\n".join( + f"{t} " + " ".join(f"{c:.3f}" for c in row.tolist()) + for t, row in zip(atom_ids, frac) + ) + return f"{lengths_str}\n{angles_str}\n{coords_str}" + + +def niggli_reduce_atoms(atoms: Atoms) -> Atoms: + """ + Try to Niggli-reduce using pymatgen (preferred). + Falls back to returning the original atoms if reduction fails or pymatgen is absent. + """ + try: + from pymatgen.core import Structure, Lattice # lazy import + species = list(atoms.elements) # per-site symbols + frac = np.array(atoms.frac_coords, dtype=float) + lat = np.array(atoms.lattice.matrix, dtype=float) + pmg = Structure(Lattice(lat), species, frac, coords_are_cartesian=False) + + # Niggli reduction on the full structure (updates lattice + fractional coords) + reduced, _ = pmg.get_reduced_structure(reduction_algo="niggli") + return Atoms( + lattice_mat=np.array(reduced.lattice.matrix), + coords=np.array(reduced.frac_coords), + elements=[str(s) for s in reduced.species], + cartesian=False, + ) + except Exception: + # Best-effort fallback: return original if anything goes wrong + return atoms + + +def format_fixed_decimals(val: float, decimals: int = 6) -> str: + """Format a number with fixed decimal places (handles scientific-notation inputs).""" + try: + v = float(val) + except Exception: + return "0" + if not np.isfinite(v): + return "0" + return f"{v:.{decimals}f}" + + +def make_raman_record( + entry: dict, + freq_decimals: int = 2, + activity_decimals: int = 6, + normalize_freq: bool = False, + niggli: bool = False, + include_max_freq: bool = False, +) -> dict | None: + atoms_dict = entry.get("atoms") + if not atoms_dict: + return None + + try: + atoms = Atoms.from_dict(atoms_dict) + except Exception: + return None + + # Optional Niggli reduction BEFORE anything else + if niggli: + atoms = niggli_reduce_atoms(atoms) + + try: + formula = atoms.composition.reduced_formula + except Exception: + formula = "Unknown" + + # Coerce to float; handles numbers or strings like "7.88E-7" + freqs = np.array(entry.get("freq_cm", []), dtype=float) + acts = np.array(entry.get("raman_activity", []), dtype=float) + + if freqs.size == 0 or acts.size == 0 or freqs.size != acts.size: + return None + + # Drop non-finite, then exclude anything that *appears* as 0.00... 
after rounding + acts = np.where(np.isfinite(acts), acts, 0.0) + acts_rounded = np.round(acts, decimals=activity_decimals) + keep_mask = acts_rounded != 0.0 # also drops "-0.0" + + if not np.any(keep_mask): + return None + + freqs_kept = freqs[keep_mask] + acts_rounded_kept = acts_rounded[keep_mask] + + # Optional normalize frequencies to [0,1], with 1.0 = max kept frequency + max_f = float(np.max(freqs_kept)) if freqs_kept.size else 0.0 + if normalize_freq: + if max_f > 0.0: + freqs_display = freqs_kept / max_f # zero maps to 0.0, max -> 1.0 + else: + freqs_display = np.zeros_like(freqs_kept) + freq_unit_caption = "normalized frequency 0–1" + else: + freqs_display = freqs_kept + freq_unit_caption = "cm^-1" + + # Sort by *display* frequency so ordering matches what we print + order = np.argsort(freqs_display) + freqs_display = freqs_display[order] + freqs_kept = freqs_kept[order] # keep original too, in case needed later + acts_rounded_kept = acts_rounded_kept[order] + + # Format output strings + fmt_f = f"{{0:.{freq_decimals}f}}" + pairs = [ + f"{fmt_f.format(float(fd))} ({format_fixed_decimals(float(act_r), activity_decimals)})" + for fd, act_r in zip(freqs_display, acts_rounded_kept) + ] + raman_text = ", ".join(pairs) + + # Build prompt text + extra_norm_line = "" + if normalize_freq and include_max_freq and max_f > 0.0: + extra_norm_line = ( + f"\nNormalization reference: 1.00 corresponds to " + f"{fmt_f.format(max_f)} cm^-1." + ) + + input_header = ( + f"The chemical formula is: {formula}.\n" + f"The Raman spectrum shows active modes in {freq_unit_caption} " + f"with normalized intensities () at: {raman_text}.{extra_norm_line}\n" + f"Generate atomic structure description with lattice lengths, angles, coordinates and atom types." + ) + + rec = { + "instruction": "Below is a description of a material.", + "input": input_header, + "output": get_crystal_string_t(atoms), + "id": entry.get("id", "na"), + "raman_text": raman_text, + } + if normalize_freq and include_max_freq: + rec["max_freq_cm"] = float(max_f) + return rec + + +def main(): + p = argparse.ArgumentParser( + description="Build Alpaca train/test JSONs from a Raman spectroscopy dataset." 
+ ) + p.add_argument("--raman-json", type=Path, required=True, + help="Path to Raman JSON file (list of entries).") + p.add_argument("--test-ratio", type=float, default=0.1, + help="Fraction for test split (default: 0.10).") + p.add_argument("--seed", type=int, default=42, + help="Random seed for the split (default: 42).") + p.add_argument("--train-out", type=Path, default=Path("alpaca_prop_train.json"), + help="Output path for train JSON.") + p.add_argument("--test-out", type=Path, default=Path("alpaca_prop_test.json"), + help="Output path for test JSON.") + p.add_argument("--freq-decimals", type=int, default=2, + help="Decimals for frequencies (cm^-1 or normalized), default: 2.") + p.add_argument("--activity-decimals", type=int, default=6, + help="Decimals for Raman activities (default: 6).") + p.add_argument("--normalize-freq", action="store_true", + help="Normalize frequencies to [0,1]; 1.0 = max kept frequency after intensity rounding.") + p.add_argument("--include-max-freq", action="store_true", + help="When used with --normalize-freq, include the unnormalized max frequency (that maps to 1.0) in the prompt.") + p.add_argument("--niggli-reduce", action="store_true", + help="Apply Niggli reduction to each cell before partitioning into train/test.") + args = p.parse_args() + + with args.raman_json.open("r", encoding="utf-8") as f: + raw = json.load(f) + + records = [] + for entry in tqdm(raw, total=len(raw), desc="Processing Raman entries"): + rec = make_raman_record( + entry, + freq_decimals=args.freq_decimals, + activity_decimals=args.activity_decimals, + normalize_freq=args.normalize_freq, + niggli=args.niggli_reduce, + include_max_freq=args.include_max_freq, + ) + if rec is not None: + records.append(rec) + + if not records: + raise SystemExit("No valid records with nonzero Raman activity (after rounding) were found.") + + # Shuffle & split AFTER optional Niggli reduction (as requested) + rng = random.Random(args.seed) + rng.shuffle(records) + n_total = len(records) + n_test = max(1, int(round(args.test_ratio * n_total))) + test = records[:n_test] + train = records[n_test:] + + dumpjson(data=train, filename=str(args.train_out)) + dumpjson(data=test, filename=str(args.test_out)) + + print(f"Wrote {len(train)} train records → {args.train_out}") + print(f"Wrote {len(test)} test records → {args.test_out}") + + # Quick compatibility check for the evaluator (needs id/input/output) + ex = test[0] + for k in ("id", "instruction", "input", "output"): + if k not in ex: + print(f"WARNING: key '{k}' missing from test example!") + + +if __name__ == "__main__": + main() + diff --git a/atomgpt/scripts/ramangpt/runner.sh b/atomgpt/scripts/ramangpt/runner.sh new file mode 100644 index 0000000..bc4e309 --- /dev/null +++ b/atomgpt/scripts/ramangpt/runner.sh @@ -0,0 +1,10 @@ +#!/bin/bash + +python make_raman_alpaca.py \ + --raman-json ramandb.json \ + --test-ratio 0.1 \ + --train-out alpaca_prop_train.json \ + --test-out alpaca_prop_test.json \ + --freq-decimals 9 \ + --activity-decimals 6 + diff --git a/requirements.txt b/requirements.txt index 33985ee..af7c05c 100644 --- a/requirements.txt +++ b/requirements.txt @@ -98,13 +98,13 @@ SQLAlchemy==2.0.43 sympy==1.14.0 threadpoolctl==3.6.0 tifffile==2025.5.10 -tokenizers==0.21.1 +tokenizers>=0.22.0 tomli==2.2.1 toolz==1.0.0 torch==2.7.0 torchvision==0.22.0 tqdm==4.67.1 -transformers==4.51.3 +transformers==4.57.1 triton==3.3.0 trl==0.15.2 typeguard==4.4.2 @@ -112,6 +112,7 @@ typing-inspection==0.4.1 typing_extensions==4.13.2 tyro==0.9.21 tzdata==2025.2 
+unsloth>=2024.10,<2025.3 urllib3==2.4.0 uv==0.7.8 xformers==0.0.30
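# End-to-end usage sketch (illustrative, not part of the patch): how the new
# factory selection and formatting helpers introduced above are expected to be
# wired together. `config` is assumed to be a TrainingPropConfig with at least
# model_name, max_seq_length, dtype, load_in_4bit, and alpaca_prompt set, and
# `raw_dataset` a Hugging Face `datasets` split with instruction/input/output
# columns, as produced by make_alpaca_json().
from atomgpt.inverse_models.factories import get_lm_factory


def build_text_dataset(config, raw_dataset):
    factory = get_lm_factory(config)  # GPTOSSFactory for "gpt-oss" names, else AtomGPTFactory
    loaded = factory.load_for_training(config)  # LoadedModel(model=..., tokenizer=...)
    fmt = factory.get_formatting_prompts_func(config, loaded.model, loaded.tokenizer)
    # Adds the "text" column that SFTTrainer's dataset_text_field expects.
    return raw_dataset.map(fmt, batched=True), loaded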