@inproceedings{b7ab279cf45f41e2b771a518d610bcbd,
  title     = {Frequency-Guided Contextual Image Captioning},
  abstract  = {Both a deep understanding of visual cues and their contextual importance are demanded by effective image captioning. However, seamlessly integrating balanced contextual information continues to be a substantial challenge. In this paper, we present FreConCap, a novel Frequency-guided Contextual Image Captioning framework, to overcome the challenge using high-frequency and background features, along with object-level region features. We transform grid features into frequency domain and filter out low-frequency components by a cutoff ratio that enhances fine details critical for detailed visual understanding. Multi-Stream Cross Attention is developed to reduce the modality gap between vision and language, and to capture the interaction of text features with high-frequency local features, objects, context, and their relationships. Our experiments on the MS COCO image captioning benchmark show the superiority of our approach as compared with existing methods for enhanced image captions with more contextual information.},
  keywords  = {Encoder-Decoder, Frequency-Guided Feature, Image Captioning, Transformer},
  author    = {Rubel, {Al Shahriar} and Shih, {Frank Y.} and Deek, {Fadi P.}},
  note      = {Publisher Copyright: {\textcopyright}2025 IEEE.; 32nd IEEE International Conference on Image Processing, ICIP 2025 ; Conference date: 14-09-2025 Through 17-09-2025},
  year      = {2025},
  doi       = {10.1109/ICIP55913.2025.11084632},
  language  = {English (US)},
  series    = {Proceedings - International Conference on Image Processing, ICIP},
  publisher = {IEEE Computer Society},
  pages     = {1229--1234},
  booktitle = {2025 IEEE International Conference on Image Processing, ICIP 2025 - Proceedings},
  address   = {United States},
}