@inproceedings{60df40bf577646d8aa779dc12132bf5d,
  title     = {Scene Text Visual Question Answering},
  author    = {Biten, {Ali Furkan} and Tito, Ruben and Mafla, Andres and Gomez, Lluis and Rusinol, Marcal and Jawahar, {C. V.} and Valveny, Ernest and Karatzas, Dimosthenis},
  booktitle = {Proceedings - 2019 International Conference on Computer Vision, ICCV 2019},
  series    = {Proceedings of the IEEE International Conference on Computer Vision},
  pages     = {4290--4300},
  publisher = {Institute of Electrical and Electronics Engineers Inc.},
  address   = {United States},
  year      = {2019},
  month     = oct,
  doi       = {10.1109/ICCV.2019.00439},
  language  = {English},
  abstract  = {Current visual question answering datasets do not consider the rich semantic information conveyed by text within an image. In this work, we present a new dataset, ST-VQA, that aims to highlight the importance of exploiting high-level semantic information present in images as textual cues in the Visual Question Answering process. We use this dataset to define a series of tasks of increasing difficulty for which reading the scene text in the context provided by the visual information is necessary to reason and generate an appropriate answer. We propose a new evaluation metric for these tasks to account both for reasoning errors as well as shortcomings of the text recognition module. In addition we put forward a series of baseline methods, which provide further insight to the newly released dataset, and set the scene for further research.},
  note      = {Funding Information: This work has been supported by projects TIN2017-89779-P, Marie-Curie (712949 TECNIOspring PLUS), aBSINTHE (Fundacion BBVA 2017), the CERCA Programme / Generalitat de Catalunya, a European Social Fund grant (CCI: 2014ES05SFOP007), NVIDIA Corporation and PhD scholarships from AGAUR (2019-FIB01233) and the UAB. Publisher Copyright: {\textcopyright} 2019 IEEE.},
}