summaryrefslogtreecommitdiffhomepage
diff options
context:
space:
mode:
author James R. Barlow <james@purplerock.ca> 2022-08-01 02:04:09 -0700
committer James R. Barlow <james@purplerock.ca> 2022-08-01 02:04:09 -0700
commit ba372e58415944c9679f7b01abd04924e3f06f8f (patch)
tree 91c61f68f545dce35e998ac68ba29bd30cd4f63a
parent c7fcbe9075e05a6e75102b2a8cb4d2375e379921 (diff)
download ocrmypdf-ba372e58415944c9679f7b01abd04924e3f06f8f.tar.gz
Reorganize validation to fix exception when Tesseract not installed
The existing logic would call an OCR plugin's get_languages function before allowing the plugin to check whether its dependencies were available. This caused an exception if Tesseract was not installed, when we were supposed to issue an error message advising the user to install Tesseract.
-rw-r--r--src/ocrmypdf/_validation.py42
-rw-r--r--tests/test_validation.py33
2 files changed, 42 insertions, 33 deletions
diff --git a/src/ocrmypdf/_validation.py b/src/ocrmypdf/_validation.py
index 544f11cb..0533b928 100644
--- a/src/ocrmypdf/_validation.py
+++ b/src/ocrmypdf/_validation.py
@@ -11,12 +11,14 @@ import logging
import os
import sys
import unicodedata
+from argparse import Namespace
from pathlib import Path
from shutil import copyfileobj
from typing import Sequence
import pikepdf
import PIL
+from pluggy import PluginManager
from ocrmypdf._exec import unpaper
from ocrmypdf.exceptions import (
@@ -40,7 +42,7 @@ log = logging.getLogger(__name__)
# --------
-def check_platform():
+def check_platform() -> None:
if os.name == 'nt' and sys.maxsize <= 2**32: # pragma: no cover
# 32-bit interpreter on Windows
log.error(
@@ -49,7 +51,7 @@ def check_platform():
)
-def check_options_languages(options, ocr_engine_languages):
+def check_options_languages(options: Namespace, ocr_engine_languages: set[str]) -> None:
if not options.languages:
options.languages = {DEFAULT_LANGUAGE}
system_lang = locale.getlocale()[0]
@@ -68,7 +70,7 @@ def check_options_languages(options, ocr_engine_languages):
raise MissingDependencyError(msg)
-def check_options_output(options):
+def check_options_output(options: Namespace) -> None:
is_latin = options.languages.issubset(HOCR_OK_LANGS)
if options.pdf_renderer.startswith('hocr') and not is_latin:
@@ -104,7 +106,7 @@ def check_options_output(options):
)
-def check_options_sidecar(options):
+def check_options_sidecar(options: Namespace) -> None:
if options.sidecar == '\0':
if options.output_file == '-':
raise BadArgsError(
@@ -121,7 +123,7 @@ def check_options_sidecar(options):
)
-def check_options_preprocessing(options):
+def check_options_preprocessing(options: Namespace) -> None:
if options.clean_final:
options.clean = True
if options.unpaper_args and not options.clean:
@@ -182,7 +184,7 @@ def _pages_from_ranges(ranges: str) -> set[int]:
return set(pages)
-def check_options_ocr_behavior(options):
+def check_options_ocr_behavior(options: Namespace) -> None:
exclusive_options = sum(
(1 if opt else 0)
for opt in (options.force_ocr, options.skip_text, options.redo_ocr)
@@ -193,7 +195,7 @@ def check_options_ocr_behavior(options):
options.pages = _pages_from_ranges(options.pages)
-def check_options_advanced(options):
+def check_options_advanced(options: Namespace) -> None:
if options.pdfa_image_compression != 'auto' and not options.output_type.startswith(
'pdfa'
):
@@ -203,7 +205,7 @@ def check_options_advanced(options):
)
-def check_options_metadata(options):
+def check_options_metadata(options: Namespace) -> None:
docinfo = [options.title, options.author, options.keywords, options.subject]
for s in (m for m in docinfo if m):
for char in s:
@@ -216,15 +218,14 @@ def check_options_metadata(options):
)
-def check_options_pillow(options):
+def check_options_pillow(options: Namespace) -> None:
PIL.Image.MAX_IMAGE_PIXELS = int(options.max_image_mpixels * 1_000_000)
if PIL.Image.MAX_IMAGE_PIXELS == 0:
PIL.Image.MAX_IMAGE_PIXELS = None
-def _check_options(options, plugin_manager, ocr_engine_languages):
+def _check_plugin_invariant_options(options: Namespace) -> None:
check_platform()
- check_options_languages(options, ocr_engine_languages)
check_options_metadata(options)
check_options_output(options)
check_options_sidecar(options)
@@ -232,15 +233,20 @@ def _check_options(options, plugin_manager, ocr_engine_languages):
check_options_ocr_behavior(options)
check_options_advanced(options)
check_options_pillow(options)
- plugin_manager.hook.check_options(options=options)
-def check_options(options, plugin_manager):
+def _check_plugin_options(options: Namespace, plugin_manager: PluginManager) -> None:
+ plugin_manager.hook.check_options(options=options)
ocr_engine_languages = plugin_manager.hook.get_ocr_engine().languages(options)
- _check_options(options, plugin_manager, ocr_engine_languages)
+ check_options_languages(options, ocr_engine_languages)
+
+
+def check_options(options: Namespace, plugin_manager: PluginManager) -> None:
+ _check_plugin_invariant_options(options)
+ _check_plugin_options(options, plugin_manager)
-def create_input_file(options, work_folder: Path) -> tuple[Path, str]:
+def create_input_file(options: Namespace, work_folder: Path) -> tuple[Path, str]:
if options.input_file == '-':
# stdin
log.info('reading file from standard input')
@@ -275,7 +281,7 @@ def create_input_file(options, work_folder: Path) -> tuple[Path, str]:
raise InputFileError(msg) from e
-def check_requested_output_file(options):
+def check_requested_output_file(options: Namespace) -> None:
if options.output_file == '-':
if sys.stdout.isatty():
raise BadArgsError(
@@ -293,13 +299,13 @@ def check_requested_output_file(options):
def report_output_file_size(
- options,
+ options: Namespace,
input_file: Path,
output_file: Path,
optimize_messages: Sequence[str] | None = None,
file_overhead: int = 4000,
page_overhead: int = 3000,
-):
+) -> None:
if optimize_messages is None:
optimize_messages = []
try:
diff --git a/tests/test_validation.py b/tests/test_validation.py
index cc836ea4..03473704 100644
--- a/tests/test_validation.py
+++ b/tests/test_validation.py
@@ -44,27 +44,24 @@ def make_opts(*args, **kwargs):
def test_hocr_notlatin_warning(caplog):
# Bypass the test to see if the language is installed; we just want to pretend
# that a non-Latin language is installed
- vd._check_options(
+ vd.check_options(
*make_opts_pm(language='chi_sim', pdf_renderer='hocr', output_type='pdfa'),
- {'chi_sim'},
)
assert 'PDF renderer is known to cause' in caplog.text
def test_old_ghostscript(caplog):
with patch('ocrmypdf._exec.ghostscript.version', return_value='9.19'):
- vd._check_options(
- *make_opts_pm(language='chi_sim', output_type='pdfa'), {'chi_sim'}
- )
+ vd.check_options(*make_opts_pm(language='chi_sim', output_type='pdfa'))
assert 'does not work correctly' in caplog.text
with patch('ocrmypdf._exec.ghostscript.version', return_value='9.18'):
with pytest.raises(MissingDependencyError):
- vd._check_options(*make_opts_pm(output_type='pdfa-3'), set())
+ vd.check_options(*make_opts_pm(output_type='pdfa-3'))
with patch('ocrmypdf._exec.ghostscript.version', return_value='9.24'):
with pytest.raises(MissingDependencyError):
- vd._check_options(*make_opts_pm(), set())
+ vd.check_options(*make_opts_pm())
def test_old_tesseract_error():
@@ -72,7 +69,7 @@ def test_old_tesseract_error():
with pytest.raises(MissingDependencyError):
opts = make_opts(pdf_renderer='sandwich', language='eng')
plugin_manager = get_plugin_manager(opts.plugins)
- vd._check_options(opts, plugin_manager, {'eng'})
+ vd.check_options(opts, plugin_manager)
def test_lossless_redo():
@@ -92,7 +89,7 @@ def test_mutex_options():
def test_optimizing(caplog):
opts = make_opts(optimize=0, jbig2_lossy=True, png_quality=18, jpeg_quality=10)
plugin_manager = get_plugin_manager(opts.plugins)
- vd._check_options(opts, plugin_manager, set())
+ vd.check_options(opts, plugin_manager)
assert 'will be ignored because' in caplog.text
@@ -100,7 +97,7 @@ def test_user_words(caplog):
with patch('ocrmypdf._exec.tesseract.has_user_words', return_value=False):
opts = make_opts(user_words='foo')
plugin_manager = get_plugin_manager(opts.plugins)
- vd._check_options(opts, plugin_manager, set())
+ vd.check_options(opts, plugin_manager)
assert (
'Tesseract 4.0 (which you have installed) ignores --user-words'
in caplog.text
@@ -109,7 +106,7 @@ def test_user_words(caplog):
with patch('ocrmypdf._exec.tesseract.has_user_words', return_value=True):
opts = make_opts(user_patterns='foo')
plugin_manager = get_plugin_manager(opts.plugins)
- vd._check_options(opts, plugin_manager, set())
+ vd.check_options(opts, plugin_manager)
assert (
'Tesseract 4.0 (which you have installed) ignores --user-words'
not in caplog.text
@@ -169,7 +166,7 @@ def test_no_progress_bar(progress_bar, resources):
opts = make_opts(progress_bar=progress_bar, input_file=(resources / 'trivial.pdf'))
plugin_manager = get_plugin_manager(opts.plugins)
- vd._check_options(opts, plugin_manager, set())
+ vd.check_options(opts, plugin_manager)
pbar_disabled = None
@@ -290,13 +287,19 @@ def test_optional_program_recommended(caplog):
def test_pagesegmode_warning(caplog):
opts = make_opts(tesseract_pagesegmode='0')
plugin_manager = get_plugin_manager(opts.plugins)
- vd._check_options(opts, plugin_manager, set())
+ vd.check_options(opts, plugin_manager)
assert 'disable OCR' in caplog.text
def test_two_languages():
- vd._check_options(
- *make_opts_pm(language='fakelang1+fakelang2'), {'fakelang1', 'fakelang2'}
+ vd.check_options_languages(
+ create_options(
+ input_file='a.pdf',
+ output_file='b.pdf',
+ parser=get_parser(),
+ language='fakelang1+fakelang2',
+ ),
+ {'fakelang1', 'fakelang2'},
)