nvaccess · seanbudd · Jan 16, 2026 · Jan 30, 2026 · Jan 30, 2026
@@ -401,6 +401,7 @@ jobs:
           - startupShutdown
           - symbols
           - vscode
+          - imageDescriptions
           - chrome_annotations
           - chrome_list
           - chrome_table

@@ -48,6 +48,9 @@ dependencies = [
 	"l2m4m==1.0.4",
 	"pyyaml==6.0.3",
 	"pymdown-extensions==10.17.1",
+	# Local image captioning
+	"onnxruntime==1.23.2",
+	"numpy==2.3.5",
 ]
 
 [project.urls]
@@ -335,6 +338,7 @@ system-tests = [
 	"robotframework==7.3.2",
 	"robotremoteserver==1.1.1",
 	"robotframework-screencaplibrary==1.6.0",
+	"onnx==1.19.1",
 ]
 unit-tests = [
 	# Creating XML unit test reports

@@ -67,6 +67,10 @@ def voiceDictsBackupDir(self) -> str:
 	def updatesDir(self) -> str:
 		return os.path.join(self.configDir, "updates")
 
+	@property
+	def modelsDir(self) -> str:
+		"""Path of the ``models`` sub-directory inside the configuration directory."""
+		subDirName = "models"
+		return os.path.join(self.configDir, subDirName)
+
 	@property
 	def nvdaConfigFile(self) -> str:
 		return os.path.join(self.configDir, "nvda.ini")

@@ -0,0 +1,44 @@
+# A part of NonVisual Desktop Access (NVDA)
+# Copyright (C) 2025 NV Access Limited, Tianze
+# This file may be used under the terms of the GNU General Public License, version 2 or later, as modified by the NVDA license.
+# For full terms and any additional permissions, see the NVDA license file: https://github.com/nvaccess/nvda/blob/master/copying.txt
+
+from logHandler import log
+
+from .imageDescriber import ImageDescriber
+from . import modelConfig
+
+# Module-wide ImageDescriber instance: None until initialize() assigns one,
+# and reset to None again by terminate().
+_localCaptioner: ImageDescriber | None = None
+
+
+def initialize():
+	"""Set up the local captioner.
+
+	Initialises ``modelConfig`` and then stores a newly constructed
+	:class:`ImageDescriber` in the module-level ``_localCaptioner``.
+	"""
+	global _localCaptioner
+	log.debug("Initializing local captioner")
+	# The model configuration is initialised before the describer is constructed.
+	modelConfig.initialize()
+	describer = ImageDescriber()
+	_localCaptioner = describer
+
+
+def terminate():
+	"""Shut down the local captioner and clear the module-level reference.
+
+	Logs an error and does nothing else when no captioner is currently held.
+	"""
+	global _localCaptioner
+	captioner = _localCaptioner
+	if captioner is not None:
+		log.debug("Terminating local captioner")
+		captioner.terminate()
+		# Only forget the instance once its own terminate() has returned.
+		_localCaptioner = None
+	else:
+		log.error("local captioner not running")
+
+
+def isModelLoaded() -> bool:
+	"""Report whether the captioning model is loaded.
+
+	:return: The captioner's ``isModelLoaded`` value, or ``False`` when no captioner exists.
+	"""
+	if _localCaptioner is None:
+		return False
+	return _localCaptioner.isModelLoaded
+
+
+def toggleImageCaptioning() -> None:
+	"""Toggle image captioning by delegating to the captioner's ``toggleSwitch``.
+
+	Does nothing when no captioner has been initialised.
+	"""
+	captioner = _localCaptioner
+	if captioner is None:
+		return
+	captioner.toggleSwitch()
@@ -0,0 +1,53 @@
+# A part of NonVisual Desktop Access (NVDA)
+# Copyright (C) 2025 NV Access Limited, Tianze
+# This file may be used under the terms of the GNU General Public License, version 2 or later, as modified by the NVDA license.
+# For full terms and any additional permissions, see the NVDA license file: https://github.com/nvaccess/nvda/blob/master/copying.txt
+
+import json
+
+from logHandler import log
+from .base import ImageCaptioner
+
+
+def imageCaptionerFactory(
+	configPath: str,
+	encoderPath: str | None = None,
+	decoderPath: str | None = None,
+	monomericModelPath: str | None = None,
+) -> ImageCaptioner:
+	"""Create the image captioner matching the architecture named in the model config.
+
+	:param configPath: Path to the model's JSON configuration file.
+	:param encoderPath: Path to the encoder model file.
+	:param decoderPath: Path to the decoder model file.
+	:param monomericModelPath: Path to a single merged model file.
+	:raises ValueError: If neither a single model nor both encoder and decoder are provided.
+	:raises FileNotFoundError: If the config file does not exist.
+	:raises NotImplementedError: If the model architecture is unsupported.
+	:raises Exception: Any other failure to read or parse the config file is logged and re-raised.
+		A config without a first ``architectures`` entry propagates the lookup error unchanged.
+	:return: An instance of ImageCaptioner.
+	"""
+	hasSplitModel = bool(encoderPath and decoderPath)
+	if not monomericModelPath and not hasSplitModel:
+		raise ValueError(
+			"You must provide either 'monomericModelPath' or both 'encoderPath' and 'decoderPath'.",
+		)
+
+	try:
+		with open(configPath, "r", encoding="utf-8") as f:
+			config = json.load(f)
+	except FileNotFoundError as e:
+		# Re-raise with a message that tells the user how to recover, keeping the original as the cause.
+		raise FileNotFoundError(
+			f"Caption model config file {configPath} not found, "
+			"please download models and config file first!",
+		) from e
+	except Exception:
+		# A missing file is handled above, so this branch covers other failures
+		# (for example invalid JSON or a permission error); log it as such.
+		log.exception(f"Failed to load caption model config file {configPath}")
+		raise
+
+	modelArchitecture = config["architectures"][0]
+	if modelArchitecture == "VisionEncoderDecoderModel":
+		# Imported lazily so the model implementation is only loaded when it is selected.
+		from .vitGpt2 import VitGpt2ImageCaptioner
+
+		# NOTE(review): monomericModelPath passes the validation above but is never forwarded;
+		# when only it is supplied, encoderPath and decoderPath are None here.
+		# Confirm VitGpt2ImageCaptioner handles that case.
+		return VitGpt2ImageCaptioner(encoderPath, decoderPath, configPath)
+	else:
+		raise NotImplementedError("Unsupported model architectures")
@@ -0,0 +1,24 @@
+# A part of NonVisual Desktop Access (NVDA)
+# Copyright (C) 2025 NV Access Limited, Tianze
+# This file may be used under the terms of the GNU General Public License, version 2 or later, as modified by the NVDA license.
+# For full terms and any additional permissions, see the NVDA license file: https://github.com/nvaccess/nvda/blob/master/copying.txt
+
+from abc import ABC, abstractmethod
+
+
+class ImageCaptioner(ABC):
+	"""Abstract base class for objects that generate a caption for an image.
+
+	Concrete subclasses must implement :meth:`generateCaption`.
+	"""
+
+	@abstractmethod
+	def generateCaption(self, image: str | bytes, maxLength: int | None = None) -> str:
+		"""Produce a caption describing the given image.
+
+		:param image: Image file path or binary data.
+		:param maxLength: Optional maximum length for the generated caption.
+		:return: The generated image caption as a string.
+		"""
+		...