# esl-reader/lib/books.py

import json
import os
import xml.dom.minidom


class Book:
    """An unpacked EPUB book loaded from a directory on disk.

    The directory is expected to contain the package document
    ``content.opf`` plus the XHTML documents it references.

    Attributes:
        sections: One dict per linear spine entry, in reading order, of the
            form ``{"title": str, "paragraphs": list[str]}`` where each
            paragraph is the serialized XML of a ``<p>`` element.
        metadata: Maps the local name of each ``dc:`` element found in the
            package ``<metadata>`` (e.g. ``"title"``) to its text content.
            When an element is repeated, the last occurrence wins.
    """

    # Declared for readers and type checkers only. The containers themselves
    # are created per instance in __init__, so books never share state.
    sections: list
    metadata: dict

    def __init__(self, src: str) -> None:
        """Load the book whose unpacked files live in directory *src*."""
        super().__init__()
        self.sections = []
        self.metadata = {}
        self.parse_book(src)

    @staticmethod
    def _node_text(node) -> str:
        """Return the concatenated text directly inside *node* ('' if none)."""
        text_types = (
            xml.dom.Node.TEXT_NODE,
            xml.dom.Node.CDATA_SECTION_NODE,
        )
        return "".join(
            child.data
            for child in node.childNodes
            if child.nodeType in text_types
        )

    def parse_book(self, src: str) -> None:
        """Read ``content.opf`` from *src* and load metadata and sections.

        Fills ``self.metadata`` from the ``dc:`` elements and appends one
        entry to ``self.sections`` for every linear spine item, in spine
        order.

        Raises:
            ValueError: If a spine ``itemref`` names an id that is not
                present in the manifest.
        """
        # Hand the path (not a text-mode file object) to the parser so the
        # bytes are decoded according to the XML declaration rather than the
        # locale's default encoding.
        dom = xml.dom.minidom.parse(f"{src}/content.opf")

        metadata = dom.getElementsByTagName("metadata")[0]
        for meta in metadata.childNodes:
            if meta.nodeType != xml.dom.Node.ELEMENT_NODE:
                continue
            if meta.prefix == "dc":
                # Empty elements (<dc:description/>) yield "" instead of
                # failing on a missing text child.
                self.metadata[meta.localName] = self._node_text(meta)

        #
        # The manifest contains a list of all the files contained in this
        # EPUB. Index it once by id; setdefault keeps the first entry when
        # an id is (incorrectly) repeated.
        #
        manifest = dom.getElementsByTagName("manifest")[0]
        href_by_id = {}
        for item in manifest.childNodes:
            if item.nodeType != xml.dom.Node.ELEMENT_NODE:
                continue
            href_by_id.setdefault(
                item.getAttribute("id"), item.getAttribute("href")
            )

        #
        # The spine contains the documents in the order they are read.
        #
        spine = dom.getElementsByTagName("spine")[0]
        for itemref in spine.childNodes:
            if itemref.nodeType != xml.dom.Node.ELEMENT_NODE:
                continue
            # Entries marked linear="no" are not part of the main reading
            # order; getAttribute returns "" when the attribute is absent.
            if itemref.getAttribute("linear") == "no":
                continue
            idref = itemref.getAttribute("idref")
            #
            # A properly created *.opf will always have a correct
            # spine/manifest; fail loudly on a dangling reference rather
            # than loading some unrelated document in its place.
            #
            if idref not in href_by_id:
                raise ValueError(
                    f"spine itemref {idref!r} has no matching manifest item "
                    f"in {src}/content.opf"
                )
            href = href_by_id[idref]
            print(f"{idref}: {href}")
            self.parse_section(src, href)

    def parse_section(self, src: str, href: str) -> None:
        """Parse one XHTML document and append it to ``self.sections``.

        Args:
            src: Directory holding the unpacked book.
            href: Path of the document relative to *src*, as given by the
                manifest.

        The section title is the text of the first ``<title>`` element and
        the paragraphs are the serialized ``<p>`` elements found under the
        first ``<body>``. A missing or empty title gives ``""``; a missing
        body gives no paragraphs.
        """
        dom = xml.dom.minidom.parse(f"{src}/{href}")

        titles = dom.getElementsByTagName("title")
        title = self._node_text(titles[0]) if titles else ""

        bodies = dom.getElementsByTagName("body")
        paragraphs = (
            [p.toxml() for p in bodies[0].getElementsByTagName("p")]
            if bodies
            else []
        )
        self.sections.append({"title": title, "paragraphs": paragraphs})