summaryrefslogtreecommitdiffhomepage
path: root/debian/patches/debian-changes
blob: b0a3332ff4c51a62e6b48493216f52be90b21fb7 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
The Debian packaging of OCRmyPDF is maintained using dgit.  For the
sake of an efficient workflow, Debian modifications to the upstream
source are squashed into a single diff, rather than a series of quilt
patches.  To obtain a patch queue for package version 1.2.3-1:

    # apt-get install dgit
    % dgit clone ocrmypdf
    % cd ocrmypdf
    % git log --oneline 1.2.3..debian/1.2.3-1 -- . ':!debian'

See dgit(1), dgit(7) and dgit-maint-merge(7) for more information.
--- ocrmypdf-6.2.2.orig/docs/index.rst
+++ ocrmypdf-6.2.2/docs/index.rst
@@ -16,7 +16,6 @@ PDF is the best format for storing and e
 
    introduction
    release_notes
-   installation
    languages
 
 .. toctree::
--- ocrmypdf-6.2.2.orig/docs/languages.rst
+++ ocrmypdf-6.2.2/docs/languages.rst
@@ -7,10 +7,7 @@ OCRmyPDF uses Tesseract for OCR, and rel
 
 Tesseract supports `most languages <https://github.com/tesseract-ocr/tesseract/blob/master/doc/tesseract.1.asc#languages>`_.
 
-For Linux users, you can often find packages that provide language packs:
-
-Debian and Ubuntu users
------------------------
+You can often find packages that provide language packs:
 
 .. code-block:: bash
 
@@ -23,44 +20,6 @@ Debian and Ubuntu users
 You can then pass the ``-l LANG`` argument to OCRmyPDF to give a hint as to what languages it should search for. Multiple
 languages can be requested using either ``-l eng+fre`` (English and French) or ``-l eng -l fre``.
 
-macOS users
------------
-
-You can install additional language packs by :ref:`installing Tesseract using Homebrew with all language packs <macos-all-languages>`.
-
-Docker users
-------------
-
-Users of the Docker image may use the alternative :ref:`"polyglot" container <docker-polyglot>` which includes all languages.
-
-Adding individual language packs to a Docker image
-""""""""""""""""""""""""""""""""""""""""""""""""""
-
-If you wish to add a single language pack, you could do the following:
-
-* Download the desired ``.trainedata`` file from the `tessdata <https://github.com/tesseract-ocr/tessdata>`_ repository. Let's use Hebrew in this example (``heb.traineddata``)
-
-* Copy the file to ``/home/user/downloads/heb.traineddata``.
-
-* Create a new container based on the ocrmypdf-tess4 image and jump into it with a terminal:
-
-.. code-block:: bash
-
-	host$ docker run  -v /home/user/downloads:/home/docker -it --entrypoint /bin/bash ocrmypdf-tess4
-
-* Put the file where Tesseract expects it:
-
-.. code-block:: bash
-
-	docker$ cp /home/docker/heb.traineddata /usr/share/tesseract-ocr/tessdata
-
-* Note the container id, and save it as a new image (in this example, ``ocrmypdf-tess4-heb``)
-
-.. code-block:: bash
-
-    host$ docker commit <container_id> ocrmypdf-tess4-heb
-
-
 Known limitations
 -----------------
 
@@ -73,4 +32,4 @@ As of v4.2, users of ocrmypdf working la
 The reasons for this are:
 
 * The latest version of Ghostscript (9.19 as of this writing) has unfixed bugs in Unicode handling that generate invalid character maps, so Ghostscript cannot be used for PDF/A conversion
-* The default "hocr" PDF renderer does not handle Asian fonts properly
\ No newline at end of file
+* The default "hocr" PDF renderer does not handle Asian fonts properly
--- ocrmypdf-6.2.2.orig/src/ocrmypdf/__main__.py
+++ ocrmypdf-6.2.2/src/ocrmypdf/__main__.py
@@ -70,7 +70,7 @@ def complain(message):
 if 'IDE_PROJECT_ROOTS' in os.environ:
     os.environ['PATH'] = '/usr/local/bin:' + os.environ['PATH']
 
-# --------
+# -------- 
 # Critical environment tests
 
 verify_python3_env()
@@ -138,17 +138,17 @@ your PDF, use --output-type pdf.
 
 If OCRmyPDF is given an image file as input, it will attempt to convert the
 image to a PDF before processing.  For more control over the conversion of
-images to PDF, use the Python package img2pdf or other image to PDF software.
+images to PDF, use img2pdf, or other image to PDF software.
 
 For example, this command uses img2pdf to convert all .png files beginning
 with the 'page' prefix to a PDF, fitting each image on A4-sized paper, and
-sending the result to OCRmyPDF through a pipe.  img2pdf is a dependency of
-ocrmypdf so it is already installed.
+sending the result to OCRmyPDF through a pipe.
 
     img2pdf --pagesize A4 page*.png | ocrmypdf - myfile.pdf
 
-Online documentation is located at:
-    https://ocrmypdf.readthedocs.io/en/latest/introduction.html
+HTML documentation is located at:
+    /usr/share/doc/ocrmypdf/html/index.html
+after installing the ocrmypdf-doc package.
 
 """)
 
@@ -170,7 +170,7 @@ parser.add_argument(
     '--image-dpi', metavar='DPI', type=int,
     help="For input image instead of PDF, use this DPI instead of file's.")
 parser.add_argument(
-    '--output-type', choices=['pdfa', 'pdf', 'pdfa-1', 'pdfa-2', 'pdfa-3'],
+    '--output-type', choices=['pdfa', 'pdf', 'pdfa-1', 'pdfa-2', 'pdfa-3'], 
     default='pdfa',
     help="Choose output type. 'pdfa' creates a PDF/A-2b compliant file for "
          "long term archiving (default, recommended) but may not suitable "
@@ -314,7 +314,7 @@ advanced.add_argument(
     help='Give up on OCR after the timeout, but copy the preprocessed page '
          'into the final output')
 advanced.add_argument(
-    '--rotate-pages-threshold', default=14.0, type=numeric(float, max_=1000), metavar='CONFIDENCE',
+    '--rotate-pages-threshold', default=14.0, type=numeric(float, 1000), metavar='CONFIDENCE',
     help="Only rotate pages when confidence is above this value (arbitrary "
          "units reported by tesseract)")
 advanced.add_argument(
@@ -504,7 +504,7 @@ def check_options_advanced(options, log)
             "--pdfa-image-compression argument has no effect when "
             "--output-type is not 'pdfa', 'pdfa-1', or 'pdfa-2'"
         )
-
+    
     if tesseract.v4() and (options.user_words or options.user_patterns):
         log.warning(
             'Tesseract 4.x ignores --user-words, so this has no effect')
@@ -592,7 +592,7 @@ def do_ruffus_exception(ruffus_five_tupl
     if exc_name == 'builtins.SystemExit':
         match = re.search(r"\.(.+?)\)", exc_value)
         exit_code_name = match.groups()[0]
-        exit_code = getattr(ExitCode, exit_code_name, 'other_error')
+        exit_code = getattr(ExitCode, exit_code_name, 'other_error')        
     elif exc_name == 'ruffus.ruffus_exceptions.MissingInputFileError':
         log.error(cleanup_ruffus_error_message(exc_value))
         exit_code = ExitCode.input_file
@@ -616,7 +616,7 @@ def do_ruffus_exception(ruffus_five_tupl
             (exc_name == 'ocrmypdf.exceptions.EncryptedPdfError'):
         log.error(textwrap.dedent("""\
             Input PDF is encrypted. The encryption must be removed to
-            perform OCR.
+            perform OCR. 
 
             For information about this PDF's security use
                 qpdf --show-encryption infilename
@@ -625,7 +625,7 @@ def do_ruffus_exception(ruffus_five_tupl
                 qpdf --decrypt [--password=[password]] infilename
 
             """))
-        exit_code = ExitCode.encrypted_pdf
+        exit_code = ExitCode.encrypted_pdf        
     elif exc_name == 'ocrmypdf.exceptions.PdfMergeFailedError':
         log.error(textwrap.dedent("""\
             Failed to merge PDF image layer with OCR layer
@@ -656,31 +656,33 @@ def do_ruffus_exception(ruffus_five_tupl
     return ExitCode.other_error
 
 
-def traverse_ruffus_exception(exceptions, options, log):
-    """Traverse a RethrownJobError and output the exceptions
+def traverse_ruffus_exception(e_args, options, log):
+    """Walk through a RethrownJobError and find the first exception.
 
-    Ruffus presents exceptions as 5 element tuples. The RethrownJobException
-    has a list of exceptions like
-        e.job_exceptions = [(5-tuple), (5-tuple), ...]
-
-    ruffus < 2.7.0 had a bug with exception marshalling that would give
-    different output whether the main or child process raised the exception.
-    We no longer support this.
-
-    Attempting to log the exception itself will re-marshall it to the logger
-    which is normally running in another process. It's better to avoid re-
-    marshalling.
+    Ruffus flattens exceptions to 5-element tuples. Because of a bug
+    in <= 2.6.3 it may present either a single tuple:
+      (task, job, exc, value, stack)
+    or something like:
+      [[(task, job, exc, value, stack)]]
+    
+    Generally cross-process exception marshalling doesn't work well
+    and ruffus doesn't support it because BaseException has its own
+    implementation of __reduce__ that attempts to reconstruct the
+    exception based on e.__init__(e.args).
+    
+    Attempting to log the exception directly marshalls it to the logger
+    which is probably in another process, so it's better to log only
+    data from the exception at this point.
 
     The exit code will be based on this, even if multiple exceptions occurred
     at the same time."""
 
-    exit_codes = []
-    for exc in exceptions:
-        exit_code = do_ruffus_exception(exc, options, log)
-        exit_codes.append(exit_code)
-
-    return exit_codes[0]  # Multiple codes are rare so take the first one
-
+    if isinstance(e_args, Sequence) and isinstance(e_args[0], str) and \
+            len(e_args) == 5:
+        return do_ruffus_exception(e_args, options, log)
+    elif is_iterable_notstr(e_args):
+        for exc in e_args:
+            return traverse_ruffus_exception(exc, options, log)
 
 
 def check_closed_streams(options):
@@ -765,7 +767,7 @@ def check_environ(options, _log):
     for k in old_envvars:
         if k in os.environ:
             _log.warning(textwrap.dedent("""\
-                OCRmyPDF no longer uses the environment variable {}.
+                OCRmyPDF no longer uses the environment variable {}. 
                 Change PATH to select alternate programs.""".format(k)))
 
 
@@ -808,14 +810,14 @@ def report_output_file_size(options, _lo
     ratio = output_size / input_size
     if ratio < 1.35 or input_size < 25000:
         return  # Seems fine
-
+    
     reasons = []
     if not fitz:
         reasons.append("The optional dependency PyMuPDF is not installed.")
     image_preproc = {
-        'deskew',
-        'clean_final',
-        'remove_background',
+        'deskew', 
+        'clean_final', 
+        'remove_background', 
         'oversample',
         'force_ocr'
     }
@@ -902,8 +904,7 @@ def run_pipeline():
     except ruffus_exceptions.RethrownJobError as e:
         if options.verbose:
             _log.debug(str(e))  # stringify exception so logger doesn't have to
-        exceptions = e.job_exceptions
-        exitcode = traverse_ruffus_exception(exceptions, options, _log)
+        exitcode = traverse_ruffus_exception(e.args, options, _log)
         if exitcode is None:
             _log.error("Unexpected ruffus exception: " + str(e))
             _log.error(repr(e))
@@ -936,7 +937,7 @@ def run_pipeline():
             _log.warning('Output file: The generated PDF is INVALID')
             return ExitCode.invalid_output_pdf
 
-        report_output_file_size(options, _log, start_input_file,
+        report_output_file_size(options, _log, start_input_file, 
                                 options.output_file)
 
     pdfinfo = context.get_pdfinfo()