Conversation
This reverts commit 9f3aecb.
| pgettext("remote", "Action unavailable when Remote Access is disabled"), | ||
| ) | ||
| SCREEN_CURTAIN = ( | ||
| lambda: _isScreenCurtainEnabled(), |
|
|
||
|
|
||
| # Module-level configuration | ||
| _localCaptioner = None |
|
|
||
|
|
||
| # Default configuration instances | ||
| _DEFAULT_ENCODER_CONFIG: _EncoderConfig | None = None |
|
|
||
| # Default configuration instances | ||
| _DEFAULT_ENCODER_CONFIG: _EncoderConfig | None = None | ||
| _DEFAULT_DECODER_CONFIG: _DecoderConfig | None = None |
| # Default configuration instances | ||
| _DEFAULT_ENCODER_CONFIG: _EncoderConfig | None = None | ||
| _DEFAULT_DECODER_CONFIG: _DecoderConfig | None = None | ||
| _DEFAULT_GENERATION_CONFIG: _GenerationConfig | None = None |
| _DEFAULT_ENCODER_CONFIG: _EncoderConfig | None = None | ||
| _DEFAULT_DECODER_CONFIG: _DecoderConfig | None = None | ||
| _DEFAULT_GENERATION_CONFIG: _GenerationConfig | None = None | ||
| _DEFAULT_MODEL_CONFIG: _ModelConfig | None = None |
| _DEFAULT_DECODER_CONFIG: _DecoderConfig | None = None | ||
| _DEFAULT_GENERATION_CONFIG: _GenerationConfig | None = None | ||
| _DEFAULT_MODEL_CONFIG: _ModelConfig | None = None | ||
| _DEFAULT_PREPROCESSOR_CONFIG: _PreprocessorConfig | None = None |
| _DEFAULT_GENERATION_CONFIG, \ | ||
| _DEFAULT_MODEL_CONFIG, \ | ||
| _DEFAULT_PREPROCESSOR_CONFIG | ||
| _DEFAULT_ENCODER_CONFIG = _EncoderConfig() |
| _DEFAULT_MODEL_CONFIG, \ | ||
| _DEFAULT_PREPROCESSOR_CONFIG | ||
| _DEFAULT_ENCODER_CONFIG = _EncoderConfig() | ||
| _DEFAULT_DECODER_CONFIG = _DecoderConfig() |
| _DEFAULT_PREPROCESSOR_CONFIG | ||
| _DEFAULT_ENCODER_CONFIG = _EncoderConfig() | ||
| _DEFAULT_DECODER_CONFIG = _DecoderConfig() | ||
| _DEFAULT_GENERATION_CONFIG = _GenerationConfig() |
There was a problem hiding this comment.
Pull request overview
This PR reintroduces experimental on-device AI image description functionality to NVDA. The feature allows users to generate image descriptions locally using ONNX models without sending data externally.
Changes:
- Adds local AI image captioning using ONNX Runtime with Vision Transformer encoder and GPT-2 decoder
- Includes model downloader for fetching ~235MB of model files from HuggingFace
- Adds new settings panel and keyboard commands (NVDA+g) for generating image descriptions
Reviewed changes
Copilot reviewed 28 out of 31 changed files in this pull request and generated 11 comments.
Show a summary per file
| File | Description |
|---|---|
| uv.lock | Downgraded multiple dependencies and added new ones (numpy, onnxruntime, etc.) |
| pyproject.toml | Added onnxruntime==1.23.2, numpy==2.3.5 dependencies |
| source/_localCaptioner/ | Core implementation of image captioning functionality |
| source/gui/settingsDialogs.py | Added AI Image Descriptions settings panel |
| source/config/configSpec.py | Added automatedImageDescriptions configuration section |
| user_docs/en/ | Added user documentation for the feature |
| tests/ | Added unit and system tests |
💡 Add Copilot custom instructions for smarter, more guided reviews. Learn how to get started.
| fontFormattingDisplay = featureFlag(optionsEnum="FontFormattingBrailleModeFlag", behaviorOfDefault="LIBLOUIS") | ||
| [[auto]] | ||
| excludedDisplays = string_list(default=list("dotPad")) | ||
| excludedDisplays = string_list(default=list("dotPad")) |
There was a problem hiding this comment.
Inconsistent indentation detected. This line uses tabs and spaces mixed incorrectly compared to the rest of the file. The line should use a single tab for indentation to match the file's indentation pattern.
| excludedDisplays = string_list(default=list("dotPad")) | |
| excludedDisplays = string_list(default=list("dotPad")) |
| [[speech]] | ||
| # LearningDisability, Blindness, LowVision | ||
| impairment = string(default="Blindness") | ||
| impairment = string(default="Blindness") |
There was a problem hiding this comment.
Inconsistent indentation detected across lines 376-392. These lines should use a single tab for indentation to match the file's standard indentation pattern, but they appear to use mixed tabs and spaces.
| :return: Absolute path of the *models* directory. | ||
| :raises OSError: When the directory cannot be created. | ||
| """ | ||
| modelsDir = os.path.abspath(config.conf["automatedImageDescriptions"]["defaultModel"]) |
There was a problem hiding this comment.
The configuration value 'defaultModel' contains a model name (e.g., 'Xenova/vit-gpt2-image-captioning'), not a directory path. Using os.path.abspath() on this will create an incorrect path. This should construct the path by joining WritePaths.modelsDir with the defaultModel value.
| modelsDir = os.path.abspath(config.conf["automatedImageDescriptions"]["defaultModel"]) | |
| defaultModelName = config.conf["automatedImageDescriptions"]["defaultModel"] | |
| modelsDir = os.path.join(WritePaths.modelsDir, defaultModelName) |
| @lru_cache() | ||
| def generateCaption( | ||
| self, | ||
| image: str | bytes, | ||
| maxLength: int | None = None, | ||
| ) -> str: |
There was a problem hiding this comment.
Using @lru_cache() on a method that accepts bytes input is problematic because bytes objects are not hashable by lru_cache in a memory-efficient way. This will cause cache bloat and potential memory issues. Either remove the cache decorator or implement a custom caching mechanism that handles image data appropriately.
| self.modelDownloader.requestCancel() | ||
| ImageDescDownloader._downloadThread = None | ||
| self._progressDialog.Hide() | ||
| self._progressDialog.Destroy() | ||
| self._progressDialog = None |
There was a problem hiding this comment.
The _stopped method attempts to destroy the progress dialog without checking if it's None first. If modelDownloader.requestCancel() is called when _progressDialog is None (e.g., during initialization), this will raise an AttributeError.
| self.modelDownloader.requestCancel() | |
| ImageDescDownloader._downloadThread = None | |
| self._progressDialog.Hide() | |
| self._progressDialog.Destroy() | |
| self._progressDialog = None | |
| if self.modelDownloader is not None: | |
| self.modelDownloader.requestCancel() | |
| ImageDescDownloader._downloadThread = None | |
| if self._progressDialog is not None: | |
| self._progressDialog.Hide() | |
| self._progressDialog.Destroy() | |
| self._progressDialog = None |
| pgettext("remote", "Action unavailable when Remote Access is disabled"), | ||
| ) | ||
| SCREEN_CURTAIN = ( | ||
| lambda: _isScreenCurtainEnabled(), |
There was a problem hiding this comment.
This 'lambda' is just a simple wrapper around a callable object. Use that object directly.
| lambda: _isScreenCurtainEnabled(), | |
| _isScreenCurtainEnabled, |
| """ | ||
|
|
||
| import io | ||
| import threading |
There was a problem hiding this comment.
Module 'threading' is imported with both 'import' and 'import from'.
| import threading |
| from threading import Thread | ||
| import wx | ||
| import ui | ||
| import _localCaptioner | ||
|
|
||
|
|
||
| class ImageDescDownloader: | ||
| _downloadThread: Thread | None = None |
There was a problem hiding this comment.
Module 'threading' is imported with both 'import' and 'import from'.
| from threading import Thread | |
| import wx | |
| import ui | |
| import _localCaptioner | |
| class ImageDescDownloader: | |
| _downloadThread: Thread | None = None | |
| import wx | |
| import ui | |
| import _localCaptioner | |
| class ImageDescDownloader: | |
| _downloadThread: threading.Thread | None = None |
| except OSError: | ||
| pass |
There was a problem hiding this comment.
'except' clause does nothing but pass and there is no explanatory comment.
| except OSError: | |
| pass | |
| except OSError as e: | |
| # Best-effort cleanup: log failure but do not interrupt the download flow. | |
| log.warning(f"Failed to remove local file '{localPath}': {e}") |
| except OSError: | ||
| pass |
There was a problem hiding this comment.
'except' clause does nothing but pass and there is no explanatory comment.
| except OSError: | |
| pass | |
| except OSError as e: | |
| log.warning(f"Failed to remove partial download '{localPath}': {e}") |
Description of user facing changes: replace the default model with Mozilla's distilvit Description of developer facing changes: None Description of development approach: None
|
@tianzeshi-study has created a build with a newer model. This model is still very lightweight so it's quality is limited. Please test the build and let us know if it is an improvement over the previous model |
This reverts commit 9f3aecb.
Link to issue number:
Reintroduces #19425
Blocked by #18662 #19337 #19338
Closes #16281
Summary of the issue:
Description of user facing changes:
Description of developer facing changes:
Description of development approach:
Testing strategy:
Known issues with pull request:
Ensure #19298 and #19299 are not reintrocued
Code Review Checklist: