import json
import os
import xml.dom.minidom


def _first_text(node) -> str:
    """Return the ``data`` of *node*'s first child, or ``""`` if unavailable."""
    # Empty elements such as <dc:creator/> have no first child at all.
    return getattr(node.firstChild, "data", "")


def _element_children(node) -> list:
    """Return the direct children of *node* that are element nodes."""
    return [
        child
        for child in node.childNodes
        if child.nodeType == xml.dom.Node.ELEMENT_NODE
    ]


class Book:
    """In-memory view of an unpacked EPUB directory.

    Attributes:
        sections: One dict per linear spine entry, in reading order, with the
            keys ``"title"`` (str) and ``"paragraphs"`` (list of the
            serialized ``<p>`` elements of the document body).
        metadata: Maps the local name of each ``dc:``-prefixed metadata
            element to its text. When a name occurs more than once the last
            occurrence wins.
    """

    # Class-level fallbacks are kept only so ``Book.sections`` and
    # ``Book.metadata`` still resolve for existing callers. ``__init__``
    # rebinds both on the instance, so instances never share these objects.
    sections: list = []
    metadata: dict = {}

    def __init__(self, src: str) -> None:
        """Parse the EPUB unpacked at directory *src*.

        Args:
            src: Directory that contains ``content.opf`` and the documents
                referenced by its manifest.
        """
        super().__init__()
        # Fresh containers per instance; the class-level ones are shared.
        self.sections = []
        self.metadata = {}
        self.parse_book(src)

    def parse_book(self, src: str) -> None:
        """Read ``{src}/content.opf`` and parse every linear spine entry.

        Fills ``self.metadata`` from the ``dc:`` elements of ``<metadata>``
        and calls :meth:`parse_section` for each spine ``itemref`` whose
        ``linear`` attribute is not ``"no"``. Prints ``"<idref>: <href>"``
        for each section before parsing it.

        Args:
            src: Directory that contains ``content.opf``.

        Raises:
            ValueError: If a spine ``itemref`` names an ``idref`` that no
                manifest element declares.
            IndexError: If ``<metadata>``, ``<manifest>`` or ``<spine>`` is
                missing from the package document.
        """
        # Binary mode lets the XML parser honour the encoding declared in
        # the document instead of the locale's default text encoding.
        with open(f"{src}/content.opf", "rb") as f:
            dom = xml.dom.minidom.parse(f)

        metadata = dom.getElementsByTagName("metadata")[0]
        for meta in _element_children(metadata):
            if meta.prefix == "dc":
                self.metadata[meta.localName] = _first_text(meta)

        # The manifest lists all the files contained in this EPUB. Index it
        # once by id; setdefault keeps the first entry for a duplicated id,
        # matching a first-match linear scan.
        manifest = dom.getElementsByTagName("manifest")[0]
        href_by_id = {}
        for item in _element_children(manifest):
            href_by_id.setdefault(
                item.getAttribute("id"), item.getAttribute("href")
            )

        # The spine contains the documents in the order they are read.
        spine = dom.getElementsByTagName("spine")[0]
        for itemref in _element_children(spine):
            # getAttribute returns "" for a missing attribute, so no separate
            # hasAttribute check is needed.
            if itemref.getAttribute("linear") == "no":
                continue

            idref = itemref.getAttribute("idref")
            # A properly created *.opf always has a consistent
            # spine/manifest; fail loudly instead of parsing the wrong file.
            if idref not in href_by_id:
                raise ValueError(
                    f"spine references unknown manifest id {idref!r}"
                )
            href = href_by_id[idref]

            print(f"{idref}: {href}")
            self.parse_section(src, href)

    def parse_section(self, src: str, href: str) -> None:
        """Parse one content document and append it to ``self.sections``.

        Args:
            src: Directory that contains the unpacked EPUB.
            href: Path of the document, joined to *src* with ``/``.

        The appended dict holds the text of the first ``<title>`` element
        (``""`` when absent or empty) and the ``toxml()`` serialization of
        every ``<p>`` under the first ``<body>`` (empty list when there is
        no body).
        """
        with open(f"{src}/{href}", "rb") as f:
            dom = xml.dom.minidom.parse(f)

        titles = dom.getElementsByTagName("title")
        title = _first_text(titles[0]) if titles else ""

        bodies = dom.getElementsByTagName("body")
        paragraphs = (
            [p.toxml() for p in bodies[0].getElementsByTagName("p")]
            if bodies
            else []
        )

        self.sections.append({"title": title, "paragraphs": paragraphs})