@article{MD3E0D727,
  author        = {Kang, Kyuchang and So, Yu-Jin and Park, Jong-Geun},
  title         = {Pre-Trained Large Language Model Pipeline for Anomaly Detection Based on the {MITRE} {ATT\&CK} Framework},
  journal       = {The Journal of Korean Institute of Communications and Information Sciences},
  year          = {2025},
  volume        = {50},
  number        = {10},
  issn          = {1226-4717},
  doi           = {10.7840/kics.2025.50.10.1631},
  keywords      = {Anomaly Detection, Pre-trained Large Language Model (LLM), MITRE ATT\&CK Framework, Network Logs Analysis, Feature Engineering, Cybersecurity},
  abstract      = {In this paper, we propose a Large Language Model (LLM) pipeline utilizing the UWF-ZeekData22 dataset based on MITRE ATT\&CK Matrix to address the growing cyber threats in modern society. We first performed an exploratory data analysis (EDA) to derive key feature groups that reflect the spatio-temporal characteristics and connectivity of network traffic logs. The derived feature groups are used to generate input sequences for pre-training the BERT model. In the pre-training phase, we applied a masked language model (MLM) task to effectively learn network traffic patterns and achieved a mask prediction accuracy of over 0.9. In the fine-tuning and inference phase, we optimized the models for anomaly detection by adopting a weighted sampling technique to handle the imbalance problem of each tactic in the dataset. The performance evaluation showed that all models had an accuracy above 0.94 and an AUC-ROC value close to 1.0. We also analyzed the impact of the padding method according to model size and found that static padding performed better for large models, while dynamic padding performed better for small models. These results demonstrate that LLM-based pre-training can successfully learn complex patterns of network traffic logs and can reliably detect various tactics. Therefore, the proposal of this paper is expected to provide a practical case study in the modernization of network security systems and the development of real-time security monitoring solutions.},
  internal-note = {volume/number inferred from the DOI path (kics.2025.50.10.1631 = year.volume.issue.firstpage); verify against the published issue. Empty keyword item and comma-separated author list fixed 2024 review.},
}