Working checkpoint

2023-11-10 21:48:15 -05:00
parent e9dbadb5e3
commit 598201425c
16 changed files with 1132 additions and 0 deletions
--- a/lib/books.py
+++ b/lib/books.py
@@ -0,0 +1,67 @@
+import json
+import os
+import xml.dom.minidom
+
+
+class Book:
+    sections = []
+    metadata = {}
+
+    def __init__(self, src: str) -> None:
+        super(Book, self).__init__()
+        self.parse_book(src)
+        return
+
+    def parse_book(self, src: str) -> None:
+        with open(f"{src}/content.opf", "r") as f:
+            dom = xml.dom.minidom.parse(f)
+        metadata = dom.getElementsByTagName("metadata")[0]
+        for meta in metadata.childNodes:
+            if meta.nodeType != xml.dom.Node.ELEMENT_NODE:
+                continue
+            if meta.prefix == "dc":
+                self.metadata[meta.localName] = meta.firstChild.data
+        #
+        # The manifest contains a list of all the files contained in this
+        # EPUB
+        #
+        manifest = dom.getElementsByTagName("manifest")[0]
+        #
+        # The spine contains the documents in order they are read
+        #
+        spine = dom.getElementsByTagName("spine")[0]
+        for itemref in spine.childNodes:
+            if itemref.nodeType != xml.dom.Node.ELEMENT_NODE:
+                continue
+            # If linear == "no" skip
+            if (
+                itemref.hasAttribute("linear")
+                and itemref.getAttribute("linear") == "no"
+            ):
+                continue
+            idref = itemref.getAttribute("idref")
+            for item in manifest.childNodes:
+                if item.nodeType != xml.dom.Node.ELEMENT_NODE:
+                    continue
+                id = item.getAttribute("id")
+                if id == idref:
+                    break
+            #
+            # A properly created *.opf will always have a correct
+            # spin/manifest.
+            #
+            href = item.getAttribute("href")
+            print(f"{idref}: {href}")
+            self.parse_section(src, href)
+        return
+
+    def parse_section(self, src: str, href: str) -> None:
+        with open(f"{src}/{href}") as f:
+            dom = xml.dom.minidom.parse(f)
+        title = dom.getElementsByTagName("title")[0].firstChild.data
+        body = dom.getElementsByTagName("body")[0]
+        paragraphs = []
+        for p in body.getElementsByTagName("p"):
+            paragraphs.append(p.toxml())
+        self.sections.append({"title": title, "paragraphs": paragraphs})
+        return