Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .github/workflows/testAndPublish.yml
Original file line number Diff line number Diff line change
Expand Up @@ -401,6 +401,7 @@ jobs:
- startupShutdown
- symbols
- vscode
- imageDescriptions
- chrome_annotations
- chrome_list
- chrome_table
Expand Down
4 changes: 4 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,9 @@ dependencies = [
"l2m4m==1.0.4",
"pyyaml==6.0.3",
"pymdown-extensions==10.17.1",
# Local image captioning
"onnxruntime==1.23.2",
"numpy==2.3.5",
]

[project.urls]
Expand Down Expand Up @@ -335,6 +338,7 @@ system-tests = [
"robotframework==7.3.2",
"robotremoteserver==1.1.1",
"robotframework-screencaplibrary==1.6.0",
"onnx==1.19.1",
]
unit-tests = [
# Creating XML unit test reports
Expand Down
4 changes: 4 additions & 0 deletions source/NVDAState.py
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,10 @@ def voiceDictsBackupDir(self) -> str:
def updatesDir(self) -> str:
    """Return the path of the ``updates`` folder located inside ``self.configDir``."""
    configRoot = self.configDir
    return os.path.join(configRoot, "updates")

@property
def modelsDir(self) -> str:
    """Return the path of the ``models`` folder located inside ``self.configDir``."""
    configRoot = self.configDir
    return os.path.join(configRoot, "models")

@property
def nvdaConfigFile(self) -> str:
    """Return the path of the ``nvda.ini`` file located inside ``self.configDir``."""
    configRoot = self.configDir
    return os.path.join(configRoot, "nvda.ini")
Expand Down
44 changes: 44 additions & 0 deletions source/_localCaptioner/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
# A part of NonVisual Desktop Access (NVDA)
# Copyright (C) 2025 NV Access Limited, Tianze
# This file may be used under the terms of the GNU General Public License, version 2 or later, as modified by the NVDA license.
# For full terms and any additional permissions, see the NVDA license file: https://github.com/nvaccess/nvda/blob/master/copying.txt

from logHandler import log

from .imageDescriber import ImageDescriber
from . import modelConfig

# Module-wide ImageDescriber instance.
# It is None at import time, assigned by initialize(), and reset to None by terminate().
_localCaptioner: ImageDescriber | None = None


def initialize():
    """Set up the local captioner.

    Initialises the model configuration first, then creates the module-wide
    ``ImageDescriber`` instance.
    """
    global _localCaptioner
    log.debug("Initializing local captioner")
    modelConfig.initialize()
    describer = ImageDescriber()
    _localCaptioner = describer


def terminate():
    """Shut down the local captioner and drop the module-wide instance.

    Logs an error and does nothing else when no captioner is currently set.
    """
    global _localCaptioner
    if _localCaptioner is not None:
        log.debug("Terminating local captioner")
        _localCaptioner.terminate()
        _localCaptioner = None
    else:
        log.error("local captioner not running")


def isModelLoaded() -> bool:
    """Report whether the captioning model is loaded.

    :return: ``False`` when no captioner instance exists, otherwise the
        captioner's own ``isModelLoaded`` value.
    """
    return _localCaptioner is not None and _localCaptioner.isModelLoaded


def toggleImageCaptioning() -> None:
    """Ask the captioner to toggle its switch (load/unload the model from memory).

    Does nothing when no captioner instance exists.
    """
    captioner = _localCaptioner
    if captioner is None:
        return
    captioner.toggleSwitch()
53 changes: 53 additions & 0 deletions source/_localCaptioner/captioner/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
# A part of NonVisual Desktop Access (NVDA)
# Copyright (C) 2025 NV Access Limited, Tianze
# This file may be used under the terms of the GNU General Public License, version 2 or later, as modified by the NVDA license.
# For full terms and any additional permissions, see the NVDA license file: https://github.com/nvaccess/nvda/blob/master/copying.txt

import json

from logHandler import log
from .base import ImageCaptioner


def imageCaptionerFactory(
    configPath: str,
    encoderPath: str | None = None,
    decoderPath: str | None = None,
    monomericModelPath: str | None = None,
) -> ImageCaptioner:
    """Create the image captioner that matches the model's configuration file.

    The architecture is taken from the first entry of the ``architectures`` list
    in the JSON configuration. Only ``VisionEncoderDecoderModel`` is supported,
    and it is built from separate encoder and decoder model files.

    :param configPath: Path to the model's JSON configuration file.
    :param encoderPath: Path to the encoder model file.
    :param decoderPath: Path to the decoder model file.
    :param monomericModelPath: Path to a single merged model file.
        No architecture handled by this factory consumes it yet.
    :raises ValueError: If neither a merged model nor both encoder and decoder are provided,
        or if the selected architecture needs encoder and decoder paths that were not given.
    :raises FileNotFoundError: If the configuration file does not exist.
    :raises NotImplementedError: If the model architecture is unsupported.
    :raises Exception: Any other error raised while reading or parsing the configuration file
        is logged and re-raised unchanged.
    :return: An ImageCaptioner instance for the configured architecture.
    """
    hasSplitModel = bool(encoderPath and decoderPath)
    if not monomericModelPath and not hasSplitModel:
        raise ValueError(
            "You must provide either 'monomericModelPath' or both 'encoderPath' and 'decoderPath'.",
        )

    try:
        with open(configPath, "r", encoding="utf-8") as f:
            config = json.load(f)
    except FileNotFoundError as e:
        raise FileNotFoundError(
            f"Caption model config file {configPath} not found, "
            "please download models and config file first!",
        ) from e
    except Exception:
        # A missing file is handled above; reaching here means the file exists
        # but could not be read or parsed (permissions, encoding, invalid JSON).
        log.exception(f"Failed to load caption model config file {configPath}")
        raise

    modelArchitecture = config["architectures"][0]
    if modelArchitecture == "VisionEncoderDecoderModel":
        # This architecture is loaded from two separate files; a merged model path
        # alone passes the check at the top but would hand None paths to the captioner.
        if not hasSplitModel:
            raise ValueError(
                "The 'VisionEncoderDecoderModel' architecture requires both 'encoderPath' and 'decoderPath'.",
            )
        from .vitGpt2 import VitGpt2ImageCaptioner

        return VitGpt2ImageCaptioner(encoderPath, decoderPath, configPath)
    else:
        raise NotImplementedError("Unsupported model architectures")
24 changes: 24 additions & 0 deletions source/_localCaptioner/captioner/base.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
# A part of NonVisual Desktop Access (NVDA)
# Copyright (C) 2025 NV Access Limited, Tianze
# This file may be used under the terms of the GNU General Public License, version 2 or later, as modified by the NVDA license.
# For full terms and any additional permissions, see the NVDA license file: https://github.com/nvaccess/nvda/blob/master/copying.txt

from abc import ABC, abstractmethod


class ImageCaptioner(ABC):
"""Abstract interface for image caption generation.

Supports generate caption for image
"""

@abstractmethod
def generateCaption(self, image: str | bytes, maxLength: int | None = None) -> str:
"""
Generate a caption for the given image.

:param image: Image file path or binary data.
:param maxLength: Optional maximum length for the generated caption.
:return: The generated image caption as a string.
"""
pass
Loading
Loading