diff --git a/README.md b/README.md index a6f9d59..881ec45 100644 --- a/README.md +++ b/README.md @@ -10,6 +10,9 @@ While the project was focused on ACA ciphers at first, a later extension added t Users that are experienced in machine learning can use the tools provided in this project to train and evaluate ML models using the `train.py` and `eval.py` scripts. For further information see the following sections *Training* and *Evaluation*. +The initial models were trained using the *Keras* and *scikit-learn* libraries. In 2025 Stefano Sala converted the code for the FFNN and LSTM machine learning architectures from Keras to *PyTorch* as part of his [Bachelor thesis](https://www.cryptool.org/media/publications/theses/BA_Stefano-Sala.pdf). One of the goals of this conversion was a more flexible code architecture for the definition and training of the machine learning models. +With these changes, the FFNN and LSTM machine learning architectures can only be trained with PyTorch, whereas the evaluation still supports both types of model files. + # License This software and the online version on https://www.cryptool.org/cto/ncid are licensed with the GPLv3 license. Private use of this software is allowed. Software using parts of the code from this repository must not be commercially used and also must be GPLv3 licensed. @@ -53,7 +56,7 @@ python3 train.py --help ``` - ``` - python3 train.py --architecture=FFNN --dataset_workers=50 --train_dataset_size=64960 --batch_size=512 --max_iter=1000000000 --min_train_len=100 --max_train_len=100 --min_test_len=100 --max_test_len=100 --model_name=t30.h5 > weights/t30.txt 2> weights/err_t30.txt & + python3 train.py --architecture=FFNN --dataset_workers=50 --train_dataset_size=64960 --batch_size=512 --max_iter=1000000000 --min_train_len=100 --max_train_len=100 --min_test_len=100 --max_test_len=100 --model_name=t30.pth > weights/t30.txt 2> weights/err_t30.txt & ``` @@ -230,11 +233,15 @@ between the rotor ciphers. 
This helps with the results since the original models [Histocrypt 2021: A Massive Machine-Learning Approach For Classical Cipher Type Detection Using Feature Engineering](https://doi.org/10.3384/ecp183) -AusDM 2021: Detection of Classical Cipher Types with Feature-Learning Approaches +AusDM 2021: Detection of Classical Cipher Types with Feature-Learning Approaches: - [Proceedings](https://link.springer.com/book/10.1007/978-981-16-8531-6) - [Pre-Print](https://www.cryptool.org/download/ncid/Detect-Classical-Cipher-Types-with-Feature-Learning_AusDM2021_PrePrint.pdf) +PyTorch conversion of FFNN and LSTM machine learning architectures: + +[Application of AI for ciphertext identification](https://www.cryptool.org/media/publications/theses/BA_Stefano-Sala.pdf) + ## BibTeX Citation If you use ncid in a scientific publication, we would appreciate using the following citations: diff --git a/cipherTypeDetection/config.py b/cipherTypeDetection/config.py index 5a3ea4e..70b77ab 100755 --- a/cipherTypeDetection/config.py +++ b/cipherTypeDetection/config.py @@ -1,3 +1,5 @@ +from enum import Enum + from cipherImplementations.cipher import INPUT_ALPHABET, UNKNOWN_SYMBOL, UNKNOWN_SYMBOL_NUMBER from cipherImplementations.simpleSubstitution import SimpleSubstitution from cipherImplementations.hill import Hill @@ -192,3 +194,9 @@ # LearningRateSchedulers decay = 1e-8 drop = 0.1 + +class Backend(Enum): + """Differentiate between the Keras and PyTorch backend for model training.""" + KERAS = 0 + PYTORCH = 1 + diff --git a/cipherTypeDetection/ensembleModel.py b/cipherTypeDetection/ensembleModel.py index 990f5fb..5ef1ef4 100644 --- a/cipherTypeDetection/ensembleModel.py +++ b/cipherTypeDetection/ensembleModel.py @@ -1,4 +1,5 @@ import tensorflow as tf +import torch import pickle import numpy as np from tensorflow.keras.optimizers import Adam @@ -7,6 +8,9 @@ import cipherTypeDetection.config as config from cipherTypeDetection.transformer import MultiHeadSelfAttention, TransformerBlock, TokenAndPositionEmbedding from cipherImplementations.cipher import OUTPUT_ALPHABET +from cipherTypeDetection.config import Backend +from cipherTypeDetection.models.ffnn import FFNN +from cipherTypeDetection.models.lstm import LSTM from util.utils import get_model_input_length @@ -37,9 +41,14 @@ mcc_nb = 0.5294535259111087 # Cohen's Kappa is not used as these values are almost the same like MCC. 
+class ModelMetadata:
+    """Bundles the file path, architecture name and Backend of one stored model."""
+    def __init__(self, path, architecture, backend):
+        self.path = path
+        self.architecture = architecture
+        self.backend = backend
+

 class EnsembleModel:
-    def __init__(self, models, architectures, strategy, cipher_indices):
+    def __init__(self, model_metadata, strategy, cipher_indices):
         self.statistics_dict = {
             "FFNN": [f1_ffnn, accuracy_ffnn, recall_ffnn, precision_ffnn, mcc_ffnn],
             "Transformer": [f1_transformer, accuracy_transformer, recall_transformer, precision_transformer, mcc_transformer],
@@ -47,10 +56,10 @@ def __init__(self, models, architectures, strategy, cipher_indices):
             "RF": [f1_rf, accuracy_rf, recall_rf, precision_rf, mcc_rf],
             "NB": [f1_nb, accuracy_nb, recall_nb, precision_nb, mcc_nb]
         }
-        self.models = models
-        self.architectures = architectures
+        self.model_metadata = model_metadata
+        self.models = [None] * len(self.model_metadata)
         self.strategy = strategy
-        if isinstance(models[0], str):
+        if isinstance(model_metadata[0].path, str):
            self.load_model()
         for key in self.statistics_dict:
             statistics = self.statistics_dict[key]
@@ -72,22 +81,53 @@ def __init__(self, models, architectures, strategy, cipher_indices):
             self.total_votes[i] += network_total_votes[i]

     def load_model(self):
-        for j in range(len(self.models)):
-            if self.architectures[j] in ("FFNN", "CNN", "LSTM", "Transformer"):
-                if self.architectures[j] == 'Transformer':
-                    model_ = tf.keras.models.load_model(self.models[j], custom_objects={
-                        'TokenAndPositionEmbedding': TokenAndPositionEmbedding, 'MultiHeadSelfAttention': MultiHeadSelfAttention,
-                        'TransformerBlock': TransformerBlock})
-                else:
-                    model_ = tf.keras.models.load_model(self.models[j])
-                optimizer = Adam(learning_rate=config.learning_rate, beta_1=config.beta_1, beta_2=config.beta_2, epsilon=config.epsilon,
-                                 amsgrad=config.amsgrad)
-                model_.compile(optimizer=optimizer, loss="sparse_categorical_crossentropy",
-                               metrics=["accuracy", SparseTopKCategoricalAccuracy(k=3, name="k3_accuracy")])
-                self.models[j] = model_
+        for i, metadata in enumerate(self.model_metadata):
+            if metadata.backend == Backend.PYTORCH:
+                self.models[i] = self._load_pytorch(metadata.architecture, metadata.path)
+            elif metadata.architecture in ("FFNN", "CNN", "LSTM", "Transformer"):
+                self.models[i] = self._load_keras(metadata.architecture, metadata.path)
             else:
-                with open(self.models[j], "rb") as f:
-                    self.models[j] = pickle.load(f)
+                with open(metadata.path, "rb") as f:
+                    self.models[i] = pickle.load(f)
+
+    def _load_pytorch(self, architecture, path):
+        checkpoint = torch.load(path, map_location=torch.device("cpu"))
+
+        if architecture == "FFNN":
+            model = FFNN(
+                input_size=checkpoint['input_size'],
+                hidden_size=checkpoint['hidden_size'],
+                output_size=checkpoint['output_size'],
+                num_hidden_layers=checkpoint['num_hidden_layers']
+            )
+        elif architecture == "LSTM":
+            model = LSTM(
+                vocab_size=checkpoint['vocab_size'],
+                embed_dim=checkpoint['embed_dim'],
+                hidden_size=checkpoint['hidden_size'],
+                output_size=checkpoint['output_size'],
+                num_layers=checkpoint['num_layers'],
+                dropout=checkpoint['dropout']
+            )
+        else:
+            raise ValueError(f"Unimplemented PyTorch architecture: {architecture}")
+
+        model.load_state_dict(checkpoint['model_state_dict'])
+        model.eval()
+        return model
+
+    def _load_keras(self, architecture, path):
+        if architecture == 'Transformer':
+            model = tf.keras.models.load_model(path, custom_objects={
+                'TokenAndPositionEmbedding': TokenAndPositionEmbedding, 'MultiHeadSelfAttention': MultiHeadSelfAttention,
+                'TransformerBlock': TransformerBlock})
+        else:
+            model = tf.keras.models.load_model(path)
+        optimizer = Adam(learning_rate=config.learning_rate, beta_1=config.beta_1, beta_2=config.beta_2, epsilon=config.epsilon,
+                         amsgrad=config.amsgrad)
+        model.compile(optimizer=optimizer, loss="sparse_categorical_crossentropy",
+                      metrics=["accuracy", SparseTopKCategoricalAccuracy(k=3, name="k3_accuracy")])
+        return model

     def evaluate(self, batch, batch_ciphertexts, labels, batch_size, metrics, verbose=0):
         correct_all = 0
@@ -111,9 +151,14 @@ def evaluate(self, batch, batch_ciphertexts, labels, batch_size, metrics, verbos

     def predict(self, statistics, ciphertexts, batch_size, verbose=0):
         predictions = []
-        for index, model in enumerate(self.models):
-            architecture = self.architectures[index]
-            if architecture == "FFNN":
+        for index, metadata in enumerate(self.model_metadata):
+            model = self.models[index]
+            architecture = metadata.architecture
+            if metadata.backend == Backend.PYTORCH:
+                # Convert TensorFlow tensors to numpy; pass other inputs through
+                # unchanged so np_statistics is always defined.
+                np_statistics = statistics.numpy() if isinstance(statistics, tf.Tensor) else statistics
+                predictions.append(model.predict(np_statistics, batch_size))
+            elif architecture == "FFNN":
                 predictions.append(model.predict(statistics, batch_size=batch_size, verbose=verbose))
             elif architecture in ("CNN", "LSTM", "Transformer"):
                 input_length = get_model_input_length(model, architecture)
@@ -168,7 +213,7 @@ def predict(self, statistics, ciphertexts, batch_size, verbose=0):
                     scaled[i][j] = scaled[i][j] / len(predictions)
         elif self.strategy == 'weighted':
             for i in range(len(predictions)):
-                statistics = self.statistics_dict[self.architectures[i]]
+                statistics = self.statistics_dict[self.model_metadata[i].architecture]
                 for j in range(len(predictions[i])):
                     for k in range(len(predictions[i][j])):
                         scaled[j][k] += predictions[i][j][k] * statistics[-1][k] / self.total_votes[k]
diff --git a/cipherTypeDetection/eval.py b/cipherTypeDetection/eval.py
index 9381e96..7b09228 100755
--- a/cipherTypeDetection/eval.py
+++ b/cipherTypeDetection/eval.py
@@ -7,7 +7,12 @@
 import pickle
 import functools
 import numpy as np
+import time
 from datetime import datetime
+
+import torch
+import torch.nn.functional as F

 # This environ variable must be set before all tensorflow imports!
 os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
@@ -22,10 +27,13 @@
 from cipherTypeDetection.cipherStatisticsDataset import CipherStatisticsDataset, PlaintextPathsDatasetParameters, RotorCiphertextsDatasetParameters, calculate_statistics, pad_sequences
 from cipherTypeDetection.predictionPerformanceMetrics import PredictionPerformanceMetrics
 from cipherTypeDetection.rotorDifferentiationEnsemble import RotorDifferentiationEnsemble
-from cipherTypeDetection.ensembleModel import EnsembleModel
+from cipherTypeDetection.ensembleModel import EnsembleModel, ModelMetadata
 from cipherTypeDetection.transformer import MultiHeadSelfAttention, TransformerBlock, TokenAndPositionEmbedding
 from util.utils import get_model_input_length
 from cipherImplementations.cipher import OUTPUT_ALPHABET, UNKNOWN_SYMBOL_NUMBER
+from cipherTypeDetection.config import Backend
+from cipherTypeDetection.models.ffnn import FFNN
+from cipherTypeDetection.models.lstm import LSTM

 tf.debugging.set_log_device_placement(enabled=False)
 # always flush after print as some architectures like RF need very long time before printing anything.
print = functools.partial(print, flush=True)
@@ -36,8 +44,23 @@ def str2bool(v):
     return v.lower() in ("yes", "true", "t", "1")

+def evaluate_torch(model, inputs, labels, batch_size):
+    # The dataset yields TensorFlow tensors; convert them to numpy before handing
+    # them to the PyTorch models (same pattern as in EnsembleModel and
+    # RotorDifferentiationEnsemble).
+    if isinstance(inputs, tf.Tensor):
+        inputs = inputs.numpy()
+    with torch.no_grad():
+        outputs = model.predict(inputs, batch_size)
+        y = torch.tensor(labels.numpy(), dtype=torch.long)
+
+        loss = F.cross_entropy(outputs, y)
+        top1 = torch.argmax(outputs, dim=1)
+        acc = (top1 == y).float().mean()
+
+        # Calculate top-3 accuracy
+        top3 = torch.topk(outputs, k=3, dim=1).indices
+        y_expanded = y.unsqueeze(1).expand_as(top3)
+        k3_acc = (top3 == y_expanded).any(dim=1).float().mean()
+
+    return loss.item(), acc.item(), k3_acc.item()
+

-def benchmark(args, model, architecture):
+def benchmark(args, model, architecture, backend):
     cipher_types = args.ciphers
     args.plaintext_folder = os.path.abspath(args.plaintext_folder)
     if args.dataset_size * args.dataset_workers > args.max_iter:
@@ -120,21 +143,26 @@ def find_ciphertext_paths_in_dir(folder_path):
     print("Datasets loaded.\n")

     print('Evaluating model...')
-    import time
+
     start_time = time.time()
     iteration = 0
     epoch = 0
     results = []
     prediction_metrics = PredictionPerformanceMetrics(model_name=architecture)
+
     while dataset.iteration < args.max_iter:
         batches = next(dataset)
         for index, batch in enumerate(batches):
             statistics, labels, ciphertexts = batch.items()
-            if architecture == "FFNN":
+            if architecture == "FFNN" and backend == Backend.KERAS:
                 results.append(model.evaluate(statistics, labels, batch_size=args.batch_size, verbose=1))
-            if architecture in ("CNN", "LSTM", "Transformer"):
+            elif architecture == "FFNN" and backend == Backend.PYTORCH:
+                results.append(evaluate_torch(model, statistics, labels, batch_size=args.batch_size))
+            elif architecture == "LSTM" and backend == Backend.PYTORCH:
+                results.append(evaluate_torch(model, ciphertexts, labels, batch_size=args.batch_size))
+            elif architecture in ("CNN", "LSTM", "Transformer"):
+                # Keras models, including legacy LSTM .h5 files, which the README
+                # promises are still supported for evaluation.
                 results.append(model.evaluate(ciphertexts, labels, batch_size=args.batch_size, verbose=1))
             elif architecture in ("DT", "NB", "RF", "ET", "SVM", "kNN"):
                 results.append(model.score(statistics, labels))
@@ -194,7 +222,7 @@ def evaluate(args, model, architecture):
             if iterations > args.max_iter:
                 break
             path = os.path.join(args.data_folder, name)
-            if os.path.isfile(path):
+            if os.path.isfile(path) and path.endswith(".txt"):
                 if iterations > args.max_iter:
                     break
                 batch = []
@@ -303,7 +331,7 @@ def evaluate(args, model, architecture):

     print("\n\nAverage evaluation results from %d iterations: avg_test_acc=%f" % (iterations, avg_test_acc))


-def predict_single_line(args, model, architecture):
+def predict_single_line(args, model, architecture, backend):
     cipher_id_result = ''
     ciphertexts = []
     result = []
@@ -338,8 +366,12 @@ def predict_single_line(args, model, architecture):
             print("\n")
             continue
         results = None
-        if architecture == "FFNN":
+        if architecture == "FFNN" and backend == Backend.KERAS:
             result = model.predict(tf.convert_to_tensor([statistics]), args.batch_size, verbose=0)
+        elif architecture == "FFNN" and backend == Backend.PYTORCH:
+            result = model.predict([statistics], args.batch_size)
+        elif architecture == "LSTM" and backend == Backend.PYTORCH:
+            result = model.predict([ciphertext], args.batch_size)
         elif architecture in ("CNN", "LSTM", "Transformer"):
             input_length = get_model_input_length(model, architecture)
             if len(ciphertext) < input_length:
@@ -399,7 +431,36 @@ def load_model(architecture, args, model_path, cipher_types):

     model = None

-    if architecture in ("FFNN", "CNN", "LSTM", "Transformer"):
+    if architecture == "FFNN" and model_path.endswith(".pth"):
+        checkpoint = torch.load(model_path, map_location=torch.device("cpu"))
+
+        model = FFNN(
+            input_size=checkpoint['input_size'],
+            hidden_size=checkpoint['hidden_size'],
+            output_size=checkpoint['output_size'],
+            num_hidden_layers=checkpoint['num_hidden_layers']
+        )
+        model.load_state_dict(checkpoint['model_state_dict'])
+        model.eval()
+
+        config.FEATURE_ENGINEERING = True
+        config.PAD_INPUT = False
+    elif architecture == "LSTM" and model_path.endswith(".pth"):
+        checkpoint = torch.load(model_path, map_location=torch.device("cpu"))
+        model = LSTM(
+            vocab_size=checkpoint['vocab_size'],
+            embed_dim=checkpoint['embed_dim'],
+            hidden_size=checkpoint['hidden_size'],
+            output_size=checkpoint['output_size'],
+            num_layers=checkpoint['num_layers'],
+            dropout=checkpoint['dropout']
+        )
+        model.load_state_dict(checkpoint['model_state_dict'])
+        model.eval()
+
+        # The LSTM consumes padded ciphertext sequences instead of statistics
+        # features (same settings as in create_model in train.py).
+        config.FEATURE_ENGINEERING = False
+        config.PAD_INPUT = True
+    elif architecture in ("FFNN", "CNN", "LSTM", "Transformer"):
         if architecture == 'Transformer':
             if not hasattr(config, "maxlen"):
                 raise ValueError("maxlen must be defined in the config when loading a Transformer model!")
@@ -427,16 +488,31 @@
         cipher_indices = []
         for cipher_type in cipher_types:
             cipher_indices.append(config.CIPHER_TYPES.index(cipher_type))
-        model = EnsembleModel(model_list, architecture_list, strategy, cipher_indices)
+        model_metadata = []
+        for i, path in enumerate(model_list):
+            metadata = ModelMetadata(path, architecture_list[i], Backend.PYTORCH if path.endswith(".pth") else Backend.KERAS)
+            model_metadata.append(metadata)
+        model = EnsembleModel(model_metadata, strategy, cipher_indices)
     else:
         raise ValueError("Unknown architecture: %s" % architecture)

-    rotor_only_model_path = args.rotor_only_model
-    with open(rotor_only_model_path, "rb") as f:
-        rotor_only_model = pickle.load(f)
+    # Check if there are rotor ciphers among those requested
+    has_rotor_ciphers = any(c in config.ROTOR_CIPHER_TYPES for c in cipher_types)
+
+    # If there are rotor ciphers, also load the rotor_only model.
+    if has_rotor_ciphers:
+        rotor_only_model_path = args.rotor_only_model
+        if not os.path.exists(rotor_only_model_path):
+            raise FileNotFoundError(f"Rotor-only model is required but not found at {rotor_only_model_path}")
+        with open(rotor_only_model_path, "rb") as f:
+            rotor_only_model = pickle.load(f)
+        return RotorDifferentiationEnsemble(architecture, model, rotor_only_model), "Ensemble"
+
+    # If there are no rotor ciphers:
+    # - if it's an ensemble, return the ensemble directly
+    # - otherwise return the normal model
+    return model, architecture

-    # Embed all models in RotorDifferentiationEnsemble to improve recognition of rotor ciphers
-    return RotorDifferentiationEnsemble(architecture, model, rotor_only_model)

 def expand_cipher_groups(cipher_types):
     """Turn cipher group identifiers (ACA, MTC3) into a list of their ciphers"""
@@ -573,8 +649,8 @@ def main():
     for arg in vars(args):
         print("{:23s}= {:s}".format(arg, str(getattr(args, arg))))
-    m = os.path.splitext(args.model)
-    if len(os.path.splitext(args.model)) != 2 or os.path.splitext(args.model)[1] != '.h5':
-        print('ERROR: The model name must have the ".h5" extension!', file=sys.stderr)
+    if os.path.splitext(args.model)[1] not in ('.h5', '.pth'):
+        print('ERROR: The model must have extension ".h5" (for Keras) or ".pth" (for PyTorch).', file=sys.stderr)
         sys.exit(1)

     architecture = args.architecture
@@ -591,12 +667,15 @@ def main():
         for i in range(len(args.models)):
             model = args.models[i]
             arch = args.architectures[i]
-            if not os.path.exists(os.path.abspath(model)):
-                raise ValueError("Model in %s does not exist." % os.path.abspath(model))
+            abs_path = os.path.abspath(model)
+            if not os.path.exists(abs_path):
+                raise ValueError("Model in %s does not exist." % abs_path)
             if arch not in ('FFNN', 'CNN', 'LSTM', 'DT', 'NB', 'RF', 'ET', 'Transformer', 'SVM', 'kNN'):
                 raise ValueError("Unallowed architecture %s" % arch)
-            if arch in ('FFNN', 'CNN', 'LSTM', 'Transformer') and not os.path.abspath(model).endswith('.h5'):
-                raise ValueError("Model names of the types %s must have the .h5 extension." % ['FFNN', 'CNN', 'LSTM', 'Transformer'])
+            if arch in ('CNN', 'Transformer') and not abs_path.endswith('.h5'):
+                raise ValueError("Model names of the types %s must have the .h5 extension." % ['CNN', 'Transformer'])
+            if arch in ('FFNN', 'LSTM') and not (abs_path.endswith('.h5') or abs_path.endswith('.pth')):
+                raise ValueError("Model names of the types %s must have the .h5 or .pth extension." % ['FFNN', 'LSTM'])
     elif args.models is not None or args.architectures is not None:
         raise ValueError("It is only allowed to use the --models and --architectures with the Ensemble architecture.")
@@ -609,18 +688,19 @@ def main():
     #     model = load_model()
     # else:
     #     model = load_model()
-    model = load_model(architecture, args, model_path, cipher_types)
+    model, architecture = load_model(architecture, args, model_path, cipher_types)
     print("Model Loaded.")

-    # Model is now always an ensemble
-    architecture = "Ensemble"
+    backend = Backend.KERAS
+    if architecture != "Ensemble" and model_path.endswith(".pth"):
+        backend = Backend.PYTORCH

     # the program was started in benchmark mode.
     if args.download_dataset is not None:
-        benchmark(args, model, architecture)
+        benchmark(args, model, architecture, backend)
     # the program was started in single_line mode.
     elif args.ciphertext is not None or args.file is not None:
-        predict_single_line(args, model, architecture)
+        predict_single_line(args, model, architecture, backend)
     # the program was started in prediction mode.
     else:
         evaluate(args, model, architecture)
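The PyTorch loading paths above all rely on the checkpoint layout written by `save_model` in `train.py`: a plain `dict` that stores the constructor hyperparameters next to `model_state_dict`. A converted checkpoint can therefore be inspected and reloaded outside of `eval.py` with a few lines; a minimal sketch, assuming the keys written by `save_model` below (the path `weights/t30.pth` is a hypothetical example):

```python
# Minimal sketch: reload a converted FFNN checkpoint outside of eval.py.
# Assumes the checkpoint keys written by save_model() in train.py;
# "weights/t30.pth" is a hypothetical example path.
import torch

from cipherTypeDetection.models.ffnn import FFNN

checkpoint = torch.load("weights/t30.pth", map_location=torch.device("cpu"))
print({k: v for k, v in checkpoint.items() if k != "model_state_dict"})  # hyperparameters only

model = FFNN(
    input_size=checkpoint["input_size"],
    hidden_size=checkpoint["hidden_size"],
    output_size=checkpoint["output_size"],
    num_hidden_layers=checkpoint["num_hidden_layers"],
)
model.load_state_dict(checkpoint["model_state_dict"])
model.eval()  # predict() additionally runs under torch.no_grad()
```
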
else: evaluate(args, model, architecture) diff --git a/cipherTypeDetection/miniBatchEarlyStoppingCallback.py b/cipherTypeDetection/miniBatchEarlyStoppingCallback.py index c3e480f..18cdbd2 100644 --- a/cipherTypeDetection/miniBatchEarlyStoppingCallback.py +++ b/cipherTypeDetection/miniBatchEarlyStoppingCallback.py @@ -87,7 +87,7 @@ def __init__(self, monitor='val_loss', min_delta=0, patience=0, verbose=1, mode= self.min_delta *= 1 else: self.min_delta *= -1 - self.best = np.Inf if self.monitor_op is np.less else -np.Inf + self.best = np.inf if self.monitor_op is np.less else -np.inf def on_epoch_end(self, epoch, logs=None): current = self.get_monitor_value(logs) diff --git a/cipherTypeDetection/models/ffnn.py b/cipherTypeDetection/models/ffnn.py new file mode 100644 index 0000000..89412c6 --- /dev/null +++ b/cipherTypeDetection/models/ffnn.py @@ -0,0 +1,36 @@ +import torch +from torch import nn +import torch.nn.functional as F + + +class FFNN(nn.Module): + def __init__(self, input_size, hidden_size, output_size, num_hidden_layers): + super().__init__() + + # saves parameters so that they can be saved and loaded later + self.input_size = input_size + self.hidden_size = hidden_size + self.output_size = output_size + self.num_hidden_layers = num_hidden_layers + + layers = [nn.Linear(input_size, hidden_size), nn.ReLU()] + for _ in range(num_hidden_layers - 1): + layers += [nn.Linear(hidden_size, hidden_size), nn.ReLU()] + layers.append(nn.Linear(hidden_size, output_size)) + self.net = nn.Sequential(*layers) + + def forward(self, x): + return self.net(x) + + @torch.no_grad + def predict(self, input, batch_size): + x = torch.tensor(input, dtype=torch.float32) + + outputs = [] + for i in range(0, len(x), batch_size): + batch = x[i : i + batch_size] + out = self(batch) + outputs.append(out) + outputs = torch.cat(outputs, dim=0) + + return F.softmax(outputs, dim=1) diff --git a/cipherTypeDetection/models/lstm.py b/cipherTypeDetection/models/lstm.py new file mode 100644 index 0000000..b4633ad --- /dev/null +++ b/cipherTypeDetection/models/lstm.py @@ -0,0 +1,72 @@ +import torch +from torch import nn +import torch.nn.functional as F + + +class LSTM(nn.Module): + def __init__( + self, vocab_size, embed_dim, hidden_size, output_size, num_layers=1, dropout=0.0 + ): + super().__init__() + + # saves parameters so that they can be saved and loaded later + self.vocab_size = vocab_size + self.embed_dim = embed_dim + self.hidden_size = hidden_size + self.output_size = output_size + self.num_layers = num_layers + self.dropout = dropout + + # Layers + self.embedding = nn.Embedding( + num_embeddings=vocab_size, embedding_dim=embed_dim, padding_idx=0 + ) + self.lstm = nn.LSTM( + input_size=embed_dim, + hidden_size=hidden_size, + num_layers=num_layers, + batch_first=True, + dropout=dropout if num_layers > 1 else 0.0, + ) + self.fc = nn.Linear(hidden_size, output_size) + + # B: Batch size – number of sequences processed in parallel + # L: Sequence length – number of time steps (tokens) in each sequence + # D: Embedding dimension – size of each token’s embedding vector + # H: Hidden size – number of features in the LSTM hidden state + # C: Number of classes – dimensionality of the output logits + + def forward(self, x): + # x: LongTensor of shape [B, L] or [B, L, 1] + if x.dim() == 3 and x.size(2) == 1: + x = x.squeeze(2) # remove channel dimension → [B, L] + + emb = self.embedding(x) # embeddings → [B, L, D] + + # LSTM returns: + # - output: hidden state at each time step → [B, L, H] + # - hidden: final hidden state 
for each layer → [num_layers, B, H] + # not used as we only need the last hidden state, but can be useful for debugging + output, (hidden, _) = self.lstm(emb) + + # hidden[-1] selects the final hidden state of the top (last) layer + # at the last time step → [B, H] + last_hidden = hidden[-1] + + # apply the fully-connected layer to get logits → [B, C] + logits = self.fc(last_hidden) + + return logits + + @torch.no_grad + def predict(self, input, batch_size): + x = torch.tensor(input, dtype=torch.int) + + outputs = [] + for i in range(0, len(x), batch_size): + batch = x[i : i + batch_size] + out = self(batch) + outputs.append(out) + outputs = torch.cat(outputs, dim=0) + + return F.softmax(outputs, dim=1) diff --git a/cipherTypeDetection/rotorDifferentiationEnsemble.py b/cipherTypeDetection/rotorDifferentiationEnsemble.py index 8f33088..3ece34c 100644 --- a/cipherTypeDetection/rotorDifferentiationEnsemble.py +++ b/cipherTypeDetection/rotorDifferentiationEnsemble.py @@ -1,10 +1,12 @@ import numpy as np import tensorflow as tf from tensorflow.keras.preprocessing.sequence import pad_sequences +from torch.nn import Module from cipherImplementations.cipher import OUTPUT_ALPHABET import cipherTypeDetection.config as config from cipherTypeDetection.featureCalculations import calculate_rotor_statistics from util.utils import get_model_input_length +from cipherTypeDetection.config import Backend class RotorDifferentiationEnsemble: """ @@ -37,6 +39,11 @@ def __init__(self, general_model_architecture, general_model, rotor_only_model): """ self._general_architecture = general_model_architecture self._general_model = general_model + self._general_model_backend = ( + Backend.PYTORCH + if isinstance(self._general_model, Module) + else Backend.KERAS + ) self._rotor_only_model = rotor_only_model def predict(self, statistics, ciphertexts, batch_size, verbose=0): @@ -69,7 +76,12 @@ def predict(self, statistics, ciphertexts, batch_size, verbose=0): # Perform full prediction for all ciphers architecture = self._general_architecture - if architecture in ("DT", "NB", "RF", "ET", "SVM", "kNN"): + backend = self._general_model_backend + if backend == Backend.PYTORCH: + if isinstance(statistics, tf.Tensor): + statistics = statistics.numpy() + predictions = self._general_model.predict(statistics, batch_size).numpy() + elif architecture in ("DT", "NB", "RF", "ET", "SVM", "kNN"): predictions = self._general_model.predict_proba(statistics) elif architecture == "Ensemble": predictions = self._general_model.predict(statistics, diff --git a/cipherTypeDetection/train.py b/cipherTypeDetection/train.py index 7d00dd7..f117d22 100755 --- a/cipherTypeDetection/train.py +++ b/cipherTypeDetection/train.py @@ -10,6 +10,15 @@ import math import pickle import functools + +# PyTorch +import torch +import torch.nn as nn +import torch.optim as optim +from torchinfo import summary +import numpy as np +from torch.utils.data import TensorDataset, DataLoader + from sklearn.pipeline import Pipeline from sklearn.preprocessing import StandardScaler from sklearn.tree import DecisionTreeClassifier, plot_tree @@ -35,18 +44,182 @@ from cipherTypeDetection.miniBatchEarlyStoppingCallback import MiniBatchEarlyStopping from cipherTypeDetection.transformer import TransformerBlock, TokenAndPositionEmbedding from cipherTypeDetection.learningRateSchedulers import TimeBasedDecayLearningRateScheduler, CustomStepDecayLearningRateScheduler +from cipherTypeDetection.models.ffnn import FFNN +from cipherTypeDetection.models.lstm import LSTM +from 
cipherTypeDetection.config import Backend + tf.debugging.set_log_device_placement(enabled=False) # always flush after print as some architectures like RF need very long time before printing anything. print = functools.partial(print, flush=True) for device in tf.config.list_physical_devices('GPU'): tf.config.experimental.set_memory_growth(device, True) + +def train_torch(model, args, train_ds, feature_engineering): + device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + model.to(device) + + optimizer = optim.Adam( + model.parameters(), + lr=config.learning_rate, + betas=(config.beta_1, config.beta_2), + eps=config.epsilon, + amsgrad=config.amsgrad + ) + criterion = nn.CrossEntropyLoss() + model.train() + + best_val_acc = 0 + patience_counter = 0 + patience_limit = 250 + + train_iter = 0 + train_epoch = 0 + start_time = time.time() + + val_data_created = False + x_val = y_val = None + + for epoch in range(args.epochs): + while train_ds.iteration < args.max_iter: + training_batches = next(train_ds) + for training_batch in training_batches: + statistics, labels = training_batch.items() + statistics = statistics.numpy() + labels = labels.numpy() + if not feature_engineering: + statistics = statistics.astype(int) + + if not val_data_created: + x_train_np, x_val_np, y_train_np, y_val_np = train_test_split( + statistics, labels, test_size=0.3 + ) + x_val = torch.tensor(x_val_np, dtype=torch.float32).to(device) + if not feature_engineering: + x_val = x_val.int() + y_val = torch.tensor(y_val_np, dtype=torch.long).to(device) + val_data_created = True + else: + x_train_np = statistics + y_train_np = labels + + # Use DataLoader for creating minibatch + x_train = torch.tensor(x_train_np, dtype=torch.float32) + if not feature_engineering: + x_train = x_train.int() + y_train = torch.tensor(y_train_np, dtype=torch.long) + + train_dataset = TensorDataset(x_train, y_train) + train_loader = DataLoader(train_dataset, batch_size=args.batch_size, shuffle=True) + + batch_losses = [] + + for x_batch, y_batch in train_loader: + x_batch = x_batch.to(device) + y_batch = y_batch.to(device) + + optimizer.zero_grad() + outputs = model(x_batch) + loss = criterion(outputs, y_batch) + loss.backward() + optimizer.step() + + batch_losses.append(loss.item()) + train_iter += len(y_batch) + + epoch_loss = sum(batch_losses) / len(batch_losses) + + # --- Validation step --- + model.eval() + with torch.no_grad(): + val_outputs = model(x_val) + val_loss = criterion(val_outputs, y_val) + val_pred = torch.argmax(val_outputs, dim=1) + val_acc = (val_pred == y_val).float().mean().item() + + top3 = torch.topk(val_outputs, k=3, dim=1).indices + y_val_exp = y_val.unsqueeze(1).expand_as(top3) + val_k3 = (top3 == y_val_exp).any(dim=1).float().mean().item() + + print(f"Epoch: {epoch+1}, Iteration: {train_iter}, " + f"Train Loss: {epoch_loss:.4f}, Val Loss: {val_loss.item():.4f}, " + f"Val Acc: {val_acc:.4f}, Val Top-3 Acc: {val_k3:.4f}") + model.train() + + # --- Early stopping check --- + if val_acc > best_val_acc: + best_val_acc = val_acc + patience_counter = 0 + else: + patience_counter += 1 + if patience_counter >= patience_limit: + print("Early stopping triggered.") + elapsed = time.time() - start_time + t = time.gmtime(elapsed) + print(f"Finished training in {t.tm_yday - 1} days {t.tm_hour} hours {t.tm_min} minutes {t.tm_sec} seconds with {train_iter} iterations.") + class DummyEarlyStopping: stop_training = True + return DummyEarlyStopping(), train_iter, f"Early stopped at epoch {epoch+1}" + + if train_iter >= 
args.max_iter: + break + if train_iter >= args.max_iter: + break + train_epoch += 1 + + elapsed = time.time() - start_time + t = time.gmtime(elapsed) + print(f"Finished training in {t.tm_yday - 1} days {t.tm_hour} hours {t.tm_min} minutes {t.tm_sec} seconds with {train_iter} iterations.") + class DummyEarlyStopping: stop_training = False + return DummyEarlyStopping(), train_iter, f"Trained for {train_epoch} epochs" + + +def predict_torch(model, args, statistics, labels, feature_engineering): + device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + model.eval() + model.to(device) + criterion = nn.CrossEntropyLoss() + + with torch.no_grad(): + statistics = statistics.numpy() + x = torch.tensor(statistics, dtype=torch.float32).to(device) + if not feature_engineering: + x = x.int() + + y = torch.tensor(labels.numpy(), dtype=torch.long).to(device) + + outputs = model(x) + loss = criterion(outputs, y) + + pred_top1 = torch.argmax(outputs, dim=1) + acc = (pred_top1 == y).float().mean().item() + + top3 = torch.topk(outputs, k=3, dim=1).indices + y_expanded = y.unsqueeze(1).expand_as(top3) + k3_acc = (top3 == y_expanded).any(dim=1).float().mean().item() + + print(f"Eval → Loss: {loss.item():.4f}, Accuracy: {acc:.4f}, Top-3 Accuracy: {k3_acc:.4f}") + + preds = torch.softmax(outputs, dim=1).cpu().numpy() + + return preds, labels.numpy() def str2bool(v): return v.lower() in ("yes", "true", "t", "1") -def create_model_with_distribution_strategy(architecture, extend_model, output_layer_size, max_train_len): +def print_model_summary(architecture, model, backend, max_train_len): + if backend == Backend.KERAS: + model.summary() + elif backend == Backend.PYTORCH: + # for LSTM use a LongTensor dummy input of shape (1, max_train_len) + if architecture == "LSTM": + summary(model, input_size=(1, max_train_len), dtypes=[torch.long]) + else: + summary(model, input_size=(1, 724)) + else: + raise ValueError(f"Unknown backend {backend}") + +def create_model_with_distribution_strategy(architecture, backend, extend_model, output_layer_size, max_train_len): """Creates models depending on the GPU count and on extend_model""" print('Creating model...') @@ -62,7 +235,8 @@ def create_model_with_distribution_strategy(architecture, extend_model, output_l extend_model = tf.keras.models.load_model(extend_model, compile=False) model = create_model(architecture, extend_model, output_layer_size, max_train_len) if architecture in ("FFNN", "CNN", "LSTM", "Transformer") and extend_model is None: - model.summary() + print_model_summary(architecture, model, backend, max_train_len) + else: print("Only one GPU found.") strategy = NullDistributionStrategy() @@ -70,7 +244,8 @@ def create_model_with_distribution_strategy(architecture, extend_model, output_l extend_model = tf.keras.models.load_model(extend_model, compile=False) model = create_model(architecture, extend_model, output_layer_size, max_train_len) if architecture in ("FFNN", "CNN", "LSTM", "Transformer") and extend_model is None: - model.summary() + print_model_summary(architecture, model, backend, max_train_len) + print('Model created.\n') return model, strategy @@ -115,13 +290,13 @@ def create_model(architecture, extend_model, output_layer_size, max_train_len): # Create new model based on architecture if architecture == "FFNN": - model = tf.keras.Sequential() - model.add(tf.keras.layers.Input(shape=(input_layer_size,))) - for _ in range(config.hidden_layers): - model.add(tf.keras.layers.Dense(hidden_layer_size, activation='relu', use_bias=True)) - 
model.add(tf.keras.layers.Dense(output_layer_size, activation='softmax'))
-        model.compile(optimizer=optimizer, loss="sparse_categorical_crossentropy",
-                      metrics=["accuracy", SparseTopKCategoricalAccuracy(k=3, name="k3_accuracy")])
+        # Use PyTorch for FFNN
+        model = FFNN(
+            input_size=input_layer_size,
+            hidden_size=hidden_layer_size,
+            output_size=output_layer_size,
+            num_hidden_layers=config.hidden_layers
+        )
         return model

     elif architecture == "CNN":
@@ -144,16 +319,16 @@ def create_model(architecture, extend_model, output_layer_size, max_train_len):
     elif architecture == "LSTM":
         config.FEATURE_ENGINEERING = False
         config.PAD_INPUT = True
-        model = tf.keras.Sequential()
-        model.add(tf.keras.layers.Embedding(56, 64, input_length=max_train_len))
-        # model_.add(tf.keras.layers.Dropout(0.2))
-        model.add(tf.keras.layers.LSTM(config.lstm_units))
-        # model_.add(tf.keras.layers.Dropout(0.2))
-        model.add(tf.keras.layers.Flatten())
-        model.add(tf.keras.layers.Dense(output_layer_size, activation='softmax'))
-        model.compile(optimizer=optimizer, loss="sparse_categorical_crossentropy",
-                      metrics=["accuracy", SparseTopKCategoricalAccuracy(k=3, name="k3_accuracy")])
+        model = LSTM(
+            vocab_size=56,
+            embed_dim=64,
+            hidden_size=config.lstm_units,
+            output_size=output_layer_size,
+            num_layers=1,
+            dropout=0.0
+        )
         return model
+
     elif architecture == "DT":
         return DecisionTreeClassifier(criterion=config.criterion, ccp_alpha=config.ccp_alpha)

@@ -263,8 +438,10 @@ def parse_arguments():
                         help='Directory for saving generated models. \n'
                              'When interrupting, the current model is \n'
                              'saved as interrupted_...')
-    parser.add_argument('--model_name', default='m.h5', type=str,
-                        help='Name of the output model file. The file must \nhave the .h5 extension.')
+    parser.add_argument('--model_name', required=True, type=str,
+                        help='Name of the output model file. 
The file must \n' + 'have the .h5 or .pth extension for Keras models or \n' + 'PyTorch models, respectively.') parser.add_argument('--ciphers', default='all', type=str, help='A comma seperated list of the ciphers to be created.\n' 'Be careful to not use spaces or use \' to define the string.\n' @@ -806,23 +983,40 @@ def create_checkpoint_callback(): print(training_stats) return early_stopping_callback, train_iter, training_stats -def save_model(model, args): +def save_model(model, args, backend): """Writes the model and the commandline arguments to disk.""" - print('Saving model...') architecture = args.architecture + if not os.path.exists(args.save_directory): os.mkdir(args.save_directory) - if args.model_name == 'm.h5': - i = 1 - while os.path.exists(os.path.join(args.save_directory, args.model_name.split('.')[0] + str(i) + '.h5')): - i += 1 - model_name = args.model_name.split('.')[0] + str(i) + '.h5' - else: - model_name = args.model_name + + model_name = args.model_name + if backend == Backend.PYTORCH and not model_name.endswith(".pth"): + model_name = model_name + '.pth' + model_path = os.path.join(args.save_directory, model_name) - if architecture in ("FFNN", "CNN", "LSTM", "Transformer"): + if architecture in ("FFNN", "LSTM"): + state_dict = { + 'model_state_dict': model.state_dict(), + 'hidden_size': model.hidden_size, + 'output_size': model.output_size, + } + + if architecture == "FFNN": + state_dict['input_size'] = model.input_size + state_dict['num_hidden_layers'] = model.num_hidden_layers + elif architecture == "LSTM": + state_dict['vocab_size'] = model.vocab_size + state_dict['embed_dim'] = model.embed_dim + state_dict['num_layers'] = model.num_layers + state_dict['dropout'] = model.dropout + + torch.save(state_dict, model_path) + + + elif architecture in ("CNN", "Transformer"): model.save(model_path) elif architecture in ("DT", "NB", "RF", "ET", "SVM", "kNN", "SVM-Rotor"): @@ -837,30 +1031,32 @@ def save_model(model, args): pickle.dump(model[1], f) elif architecture == "[DT,ET,RF,SVM,kNN]": - for index, name in enumerate(["dt","et","rf","svm","knn"]): + for index, name in enumerate(["dt", "et", "rf", "svm", "knn"]): # TODO: Are these files actually in the h5 format? Probably not! with open('../data/models/' + model_path.split('.')[0] + f"_{name}.h5", "wb") as f: # this gets very large pickle.dump(model[index], f) - - # Write user provided commandline arguments into mode path + + # Write user provided commandline arguments into model path with open('../data/' + model_path.split('.')[0] + '_parameters.txt', 'w') as f: for arg in vars(args): f.write("{:23s}= {:s}\n".format(arg, str(getattr(args, arg)))) - # Remove logs of previous run - if architecture in ("FFNN", "CNN", "LSTM", "Transformer"): + # Managing logs + if architecture in ("CNN", "Transformer"): logs_destination = '../data/' + model_name.split('.')[0] + '_tensorboard_logs' try: - if os.path.exists(logs_destination): - shutil.rmtree(logs_destination) - shutil.move('../data/logs', logs_destination) + if os.path.exists('../data/logs'): + if os.path.exists(logs_destination): + shutil.rmtree(logs_destination) + shutil.move('../data/logs', logs_destination) except Exception: - print(f"Could not remove logs of previous run. 
Move of current logs "
-                  f"from '../data/logs' to '{logs_destination}' failed.")
-
+            print(f"Could not move logs from '../data/logs' to '{logs_destination}'.")

     print('Model saved.\n')
+
+
 def predict_test_data(test_ds, model, args, early_stopping_callback, train_iter):
     """
     Testing the predictions of the model.
@@ -900,7 +1096,7 @@ def predict_test_data(test_ds, model, args, early_stopping_callback, train_iter)
     cntr = 0
     test_iter = 0
     test_epoch = 0
-    
+
     # Determine the number of iterations to use for evaluating the model
     prediction_dataset_factor = 10
     if early_stopping_callback.stop_training:
@@ -969,6 +1165,12 @@ def predict_test_data(test_ds, model, args, early_stopping_callback, train_iter)
                 prediction_metrics["RF"].add_predictions(labels, model[2].predict_proba(statistics))
                 prediction_metrics["SVM"].add_predictions(labels, model[3].predict_proba(statistics))
                 prediction_metrics["kNN"].add_predictions(labels, model[4].predict_proba(statistics))
+            elif architecture == "FFNN":
+                prediction, labels = predict_torch(model, args, statistics, labels, feature_engineering=True)
+                prediction_metrics[architecture].add_predictions(labels, prediction)
+            elif architecture == "LSTM":
+                prediction, labels = predict_torch(model, args, statistics, labels, feature_engineering=False)
+                prediction_metrics[architecture].add_predictions(labels, prediction)
             else:
                 prediction = model.predict(statistics, batch_size=args.batch_size, verbose=1)
                 prediction_metrics[architecture].add_predictions(labels, prediction)
@@ -1056,13 +1258,22 @@ def main():
     architecture = args.architecture
     extend_model = args.extend_model

+    backend = Backend.KERAS
+    if architecture in ("FFNN", "LSTM"):
+        backend = Backend.PYTORCH
+
     # Validate inputs
-    if len(os.path.splitext(args.model_name)) != 2 or os.path.splitext(args.model_name)[1] != '.h5':
-        print('ERROR: The model name must have the ".h5" extension!', file=sys.stderr)
+    if os.path.splitext(args.model_name)[1] not in ('.h5', '.pth'):
+        print('ERROR: The model must have extension ".h5" (for Keras) or ".pth" (for PyTorch).', file=sys.stderr)
+        sys.exit(1)
+
+    if backend == Backend.PYTORCH and os.path.splitext(args.model_name)[1] != ".pth":
+        print("ERROR: PyTorch models must have .pth file extension.", file=sys.stderr)
         sys.exit(1)
+
     if extend_model is not None:
-        if architecture not in ('FFNN', 'CNN', 'LSTM'):
+        if architecture != 'CNN':
            print('ERROR: Models with the architecture %s can not be extended!'
% architecture, file=sys.stderr)
             sys.exit(1)
@@ -1101,18 +1312,23 @@ def main():
     output_layer_size = max([config.CIPHER_TYPES.index(type) for type in cipher_types]) + 1

     # Create a model and allow for distributed training on multi-GPU machines
-    model, strategy = create_model_with_distribution_strategy(architecture,
-                                                              extend_model,
-                                                              output_layer_size=output_layer_size,
-                                                              max_train_len=args.max_train_len)
+    model, strategy = create_model_with_distribution_strategy(
+        architecture, backend, extend_model, output_layer_size=output_layer_size, max_train_len=args.max_train_len)

-    early_stopping_callback, train_iter, training_stats = train_model(model, strategy,
+    if backend == Backend.KERAS:
+        early_stopping_callback, train_iter, training_stats = train_model(model, strategy,
                                                                        args, train_ds)
-    save_model(model, args)
+    elif backend == Backend.PYTORCH:
+        early_stopping_callback, train_iter, training_stats = train_torch(model, args, train_ds, config.FEATURE_ENGINEERING)
+    else:
+        raise ValueError(f"Unknown backend: {backend}")
+
+    save_model(model, args, backend)

     prediction_stats = predict_test_data(test_ds, model, args, early_stopping_callback, train_iter)
     print(training_stats)
     print(prediction_stats)

 if __name__ == "__main__":
     main()
diff --git a/requirements.txt b/requirements.txt
index 9e688c7..d5d7893 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -6,3 +6,5 @@ tensorflow_datasets==4.9.4
 scikit_learn==1.4.0 # do not change this to be able to load old models!
 h5py==3.10.0
 pandas==2.2.0
+torch==2.7.0
+torchinfo==1.8.0
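For reference, the top-3 ("k3") accuracy used by `evaluate_torch` in `eval.py` and `predict_torch` in `train.py` can be checked in isolation. A self-contained example with made-up logits (values chosen without ties, so the result is deterministic):

```python
# Self-contained check of the top-3 accuracy computation used in
# evaluate_torch / predict_torch; logits and labels are made-up values.
import torch

outputs = torch.tensor([[0.10, 0.50, 0.20, 0.15],   # top-3 classes: {1, 2, 3}
                        [0.70, 0.15, 0.10, 0.05]])  # top-3 classes: {0, 1, 2}
y = torch.tensor([3, 3])                            # true class of both samples

top3 = torch.topk(outputs, k=3, dim=1).indices      # shape [B, 3]
hits = (top3 == y.unsqueeze(1).expand_as(top3)).any(dim=1)
print(hits.float().mean().item())  # 0.5: class 3 is only in the first row's top-3
```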