1# ...
2#
3# Copyright (c) 2019 Philippe Mathieu-Daudé <f4bug@amsat.org>
4#
5# This work is licensed under the terms of the GNU GPL, version 2 or
6# later. See the COPYING file in the top-level directory.
7
8import re
9import logging
10
11from avocado.utils import process
12from avocado.utils.path import find_command, CmdNotFoundError
13
def tesseract_available(expected_version):
    """Check that tesseract is installed and recent enough.

    :param expected_version: minimum acceptable major version (integer)
    :returns: True if the ``tesseract`` command is found and the major
              version it reports is at least ``expected_version``;
              False if the command is missing or no version number can
              be recognized in its output
    """
    try:
        find_command('tesseract')
    except CmdNotFoundError:
        return False
    res = process.run('tesseract --version')
    # The version banner is looked up on stdout first and stderr second,
    # since either stream may be the one carrying it.
    for output in (res.stdout_text, res.stderr_text):
        # Accept an optional 'v' prefix and multi-digit major numbers
        # (NOTE(review): some builds reportedly print "tesseract v5.x";
        # confirm against the targeted distributions).
        match = re.search(r'tesseract\s+v?(\d+)', output)
        if match is not None:
            # the captured group is guaranteed to be made of digits
            return int(match.group(1)) >= expected_version
    return False
31
32
def tesseract_ocr(image_path, tesseract_args='', tesseract_version=3):
    """Run tesseract on an image and collect the text it prints.

    :param image_path: path of the image handed to tesseract
    :param tesseract_args: extra command line arguments, inserted before
                           the image path
    :param tesseract_version: when equal to 4, ``--oem 1`` is appended
                              to the arguments
    :returns: list of the non-empty output lines, stripped of
              surrounding whitespace, in the order they were printed
    """
    log = logging.getLogger('tesseract')
    log.debug(image_path)
    if tesseract_version == 4:
        tesseract_args += ' --oem 1'
    # 'stdout' as output base makes tesseract print the recognized text
    # instead of writing it to a file.
    command = "tesseract {} {} stdout".format(tesseract_args, image_path)
    result = process.run(command)
    recognized = []
    for raw_line in result.stdout_text.split('\n'):
        text = raw_line.strip()
        if not text:
            # skip blank lines
            continue
        log.debug(text)
        recognized.append(text)
    return recognized
47