Source code for cioprocessor.lib.epubize

"""ePub Generation."""

from os import walk
from os.path import exists, join, basename, relpath, dirname
from re import compile as re_compile, match, findall
from zipfile import ZIP_DEFLATED, ZIP_STORED, ZipFile
from subprocess import PIPE, Popen

from lxml import etree

from chrysalio.lib.utils import mimetype_get
from chrysalio.lib.utils import load_guessing_encoding, make_digest
from chrysalio.lib.xml import load_xml
from .i18n import _


# XML namespace used to query ``META-INF/container.xml`` for the ``rootfile``
# element whose ``full-path`` attribute points at the OPF file.
CONTAINER_NS = 'urn:oasis:names:tc:opendocument:xmlns:container'
# XML namespace used to find the ``manifest`` element of the OPF file.
OPF_NS = 'http://www.idpf.org/2007/opf'
# Regular expression for file and directory names to leave out: names ending
# with ``~`` or ``.tmp``, optionally followed by an extension of 1 to 4 word
# characters.
REMOVE_PATTERN = r'(~|\.tmp)(\.\w{1,4})?$'


# =============================================================================
class EPubize():
    """Class to transform a directory into an ePub."""

    # -------------------------------------------------------------------------
    def convert(self, step, epub_dir, epub_file):
        """Convert a directory into an ePub according to ``step``.

        The directory must contain a ``mimetype`` file and a
        ``META-INF/container.xml`` file pointing at the OPF file. Depending
        on ``step``, the manifest is completed (``'complete-manifest'`` equal
        to ``'true'``) and the ``${...}`` size variables are replaced
        (``'fixed'`` equal to ``'true'``) before the archive is written.

        :param dict step:
            Dictionary representing step of type `epubize`.
        :param str epub_dir:
            Absolute path to the directory representing the ePub.
        :param str epub_file:
            Absolute path to the ePub to create.
        :rtype: :class:`pyramid.i18n.TranslationString` or ``None``
        :return:
            An error message or ``None`` if the ePub has been created.
        """
        # Check the structure
        container = join(epub_dir, 'META-INF', 'container.xml')
        if not exists(join(epub_dir, 'mimetype')) or not exists(container):
            return _('Incorrect OCF structure')
        tree = load_xml(container)
        # As in the other methods of this class, anything which is not a
        # tree is the error reported by ``load_xml``.
        # pylint: disable = protected-access
        if not isinstance(tree, etree._ElementTree):
            return tree
        # pylint: enable = protected-access
        opf_file = tree.xpath(
            '/*/*/ns:rootfile/@full-path', namespaces={'ns': CONTAINER_NS})
        if not opf_file:
            return _('Unable to find OPF file')
        opf_file = join(epub_dir, opf_file[0])

        # Update manifest
        if step.get('complete-manifest') == 'true':
            error = self._update_manifest(epub_dir, opf_file)
            if error is not None:
                return error

        # Update size
        if step.get('fixed') == 'true':
            error = self._update_image_size(epub_dir, opf_file)
            if error is not None:
                return error

        # Create ZIP file
        exclude = re_compile(REMOVE_PATTERN)
        with ZipFile(epub_file, 'w', ZIP_DEFLATED) as zip_file:
            # The mimetype file is written first and without compression;
            # it is skipped in the loop below to avoid a duplicate entry.
            zip_file.write(join(epub_dir, 'mimetype'), 'mimetype', ZIP_STORED)
            for path, dirs, files in walk(epub_dir):
                # Prune in place so that walk() does not enter excluded
                # directories.
                dirs[:] = [name for name in dirs if not exclude.search(name)]
                for name in files:
                    if exclude.search(name) or name == 'mimetype':
                        continue
                    fullname = join(path, name)
                    try:
                        zip_file.write(fullname, relpath(fullname, epub_dir))
                    except OSError:
                        # Best effort: a file which cannot be read is left
                        # out of the archive.
                        pass
        return None

    # -------------------------------------------------------------------------
    @classmethod
    def _update_manifest(cls, epub_dir, opf_file):
        """Update file list in manifest tag.

        Each file found under ``epub_dir`` (outside ``META-INF``) which is
        neither declared in the manifest nor excluded by ``REMOVE_PATTERN``
        is appended as a new ``item`` element. The OPF file is rewritten only
        if at least one item has been added.

        :param str epub_dir:
            Absolute path to the directory representing the ePub.
        :param str opf_file:
            Absolute path to the OPF file.
        :rtype: :class:`pyramid.i18n.TranslationString` or ``None``
        :return:
            An error message or ``None``.
        """
        # pylint: disable = too-many-locals
        # Find the manifest element
        tree = load_xml(
            opf_file, parser=etree.XMLParser(remove_blank_text=True))
        # pylint: disable = protected-access
        if not isinstance(tree, etree._ElementTree):
            return tree
        # pylint: enable = protected-access
        manifest_elt = tree.xpath(
            '/*/opf:manifest', namespaces={'opf': OPF_NS})
        if not manifest_elt:
            return _('Manifest is missing.')
        manifest_elt = manifest_elt[0]

        # Browse declared files (paths are relative to the OPF directory);
        # the OPF file itself and the mimetype file are never declared.
        root = dirname(opf_file)
        done = {
            basename(opf_file), relpath(join(epub_dir, 'mimetype'), root)}
        done.update(
            elt.get('href')
            for elt in manifest_elt.iterchildren(tag=etree.Element))

        # Browse real files
        modified = False
        exclude = re_compile(REMOVE_PATTERN)
        for path, dirs, files in walk(epub_dir):
            # Prune in place so that walk() does not enter META-INF.
            dirs[:] = [name for name in dirs if name != 'META-INF']
            for name in files:
                relname = relpath(join(path, name), root)
                if relname in done or exclude.search(name):
                    continue
                if not modified:
                    # Mark where the generated items start.
                    manifest_elt.append(etree.Comment('OTHER'))
                elt = etree.SubElement(manifest_elt, 'item')
                elt.set('id', 'x_{0}'.format(make_digest(relname)))
                elt.set('href', relname)
                elt.set('media-type', mimetype_get(join(path, name))[0])
                modified = True

        # Save modified file
        if modified:
            tree.write(
                opf_file, encoding='utf-8', xml_declaration=True,
                pretty_print=True)

        return None

    # -------------------------------------------------------------------------
    def _update_image_size(self, epub_dir, opf_file):
        """Detect the size of images in a fixed-layout ePub and replace
        height and width in the files.

        In each ``.html``, ``.xhtml``, ``.opf`` or ``.css`` file under
        ``epub_dir``, each ``${name}`` variable is replaced by the matching
        value of the computed size (``width`` or ``height``) or by an empty
        string for any other name. Modified files are rewritten in UTF-8.

        :param str epub_dir:
            Absolute path to the directory representing the ePub.
        :param str opf_file:
            Absolute path to the OPF file.
        :rtype: :class:`pyramid.i18n.TranslationString` or ``None``
        :return:
            An error message or ``None``.
        """
        # Compute height and width of ePub
        size, error = self._find_image_size(opf_file)
        if error is not None:
            return error
        if size['width'] == '0' or size['height'] == '0':
            return _('no image to figure out the ePub size')

        # Replace variables
        # NOTE(review): the file name pattern is not anchored at its end, so
        # a name such as ``style.css.orig`` is processed too -- confirm this
        # is intended.
        for root, ignored_, files in walk(epub_dir):
            for name in [
                    k for k in files if match('.+\\.(x?html|opf|css)', k)]:
                modified = False
                content = load_guessing_encoding(join(root, name))

                # Replace ${...}
                for item in findall(r'\$\{([^}]+)\}', content):
                    content = content.replace(
                        '${{{0}}}'.format(item), size.get(item, ''))
                    modified = True

                if modified:
                    with open(join(root, name), 'wb') as hdl:
                        hdl.write(content.encode('utf8'))

        return None

    # -------------------------------------------------------------------------
    @classmethod
    def _find_image_size(cls, opf_file):
        """Find size of the biggest image.

        The size of each image declared in the manifest, except the one with
        the ``cover-image`` property, is read with the ``identify`` command.
        The greatest width and the greatest height are kept independently.
        Images which cannot be measured are ignored.

        :param str opf_file:
            Absolute path to the OPF file.
        :rtype: tuple
        :return:
            A tuple such as ``(size, error)`` where ``size`` is a dictionary
            with keys ``'height'`` and ``'width'`` (values are strings, ``'0'``
            if no image has been measured) and ``error`` is an error message
            or ``None``.
        """
        # Find the manifest element
        tree = load_xml(opf_file)
        # pylint: disable = protected-access
        if not isinstance(tree, etree._ElementTree):
            return {'width': '0', 'height': '0'}, tree
        # pylint: enable = protected-access
        manifest_elt = tree.xpath(
            '/*/opf:manifest', namespaces={'opf': OPF_NS})
        if not manifest_elt:
            # Report the problem instead of failing with an IndexError.
            return {'width': '0', 'height': '0'}, _('Manifest is missing.')
        manifest_elt = manifest_elt[0]

        # Browse images
        width = 0
        height = 0
        root = dirname(opf_file)
        for elt in manifest_elt.iterchildren(tag=etree.Element):
            # An item without media-type is not considered as an image.
            if 'image' not in elt.get('media-type', '') \
               or 'cover-image' in elt.get('properties', ''):
                continue

            # One line per frame: without the trailing newline, the values
            # of a multi-frame image would be glued together.
            try:
                with Popen(
                        ['nice', 'identify', '-format', '%w %h\n',
                         join(root, elt.get('href'))],
                        stdout=PIPE, stderr=PIPE) as proc:
                    result = proc.communicate()[0]
            except OSError:
                continue

            if not result:
                continue

            # Only the first frame is used; an unexpected output is ignored.
            values = result.split()
            try:
                image_width = abs(int(values[0]))
                image_height = abs(int(values[1]))
            except (IndexError, ValueError):
                continue

            width = max(width, image_width)
            height = max(height, image_height)

        return {'width': str(width), 'height': str(height)}, None