@article{M55D17CB6,
  title    = "ViT-Based Future Road Image Prediction: Evaluation via VLM",
  journal  = "The Journal of Korean Institute of Communications and Information Sciences",
  year     = "2025",
  issn     = "1226-4717",
  doi      = "10.7840/kics.2025.50.10.1532",
  author   = "Donghyun Kim and Jaerock Kwon and Haewoon Nam",
  keywords = "Autonomous Driving, Vision-Language Model, Semantic Evaluation, Vision Transformer",
  abstract = "This paper proposes a Vision Transformer (ViT)-based model for predicting future driving scenes. The proposed ViT architecture processes input images as patches and leverages the attention mechanism to efficiently learn global visual information, while also integrating control inputs to capture correlations between visual context and driving actions. Experimental results show that the ViT-based model generates sharper images than the baseline and achieves higher semantic similarity in explanation evaluations using a Vision-Language Model (VLM). These results suggest that the ViT architecture is effective not only for future prediction but also for explainable autonomous driving control."
}