Yamazaki et al. "VLTinT: Visual-Linguistic Transformer-in-Transformer for Coherent Video Paragraph Captioning." AAAI Conference on Artificial Intelligence, 2023. doi:10.1609/AAAI.V37I3.25412
Markdown
[Yamazaki et al. "VLTinT: Visual-Linguistic Transformer-in-Transformer for Coherent Video Paragraph Captioning." AAAI Conference on Artificial Intelligence, 2023.](https://mlanthology.org/aaai/2023/yamazaki2023aaai-vltint/) doi:10.1609/AAAI.V37I3.25412
BibTeX
@comment{yamazaki2023aaai-vltint: conference paper entry. Only the coined name VLTinT is brace-protected in the title so that bibliography styles can recase the remaining words; the page range uses the BibTeX double-hyphen form; the DOI is stored bare, without a resolver prefix.}
@inproceedings{yamazaki2023aaai-vltint,
  title     = {{VLTinT}: Visual-Linguistic Transformer-in-Transformer for Coherent Video Paragraph Captioning},
  author    = {Yamazaki, Kashu and Vo, Khoa and Truong, Quang Sang and Raj, Bhiksha and Le, Ngan},
  booktitle = {{AAAI} Conference on Artificial Intelligence},
  year      = {2023},
  pages     = {3081--3090},
  doi       = {10.1609/AAAI.V37I3.25412},
  url       = {https://mlanthology.org/aaai/2023/yamazaki2023aaai-vltint/},
}