import json
import os
import xml.dom.minidom
from typing import Dict, List, cast

from PyQt6.QtSql import QSqlQuery

from main import query_error


class Book:
    """An unpacked EPUB whose metadata and text sections are kept in the database.

    Constructing a ``Book`` parses ``<src>/content.opf``, stores the book in
    the ``books`` / ``sections`` tables if its uuid is not there yet, and
    then reloads the instance state from those tables.

    Attributes:
        sections: One serialized ``<body>`` XML string per spine document,
            in reading order.
        metadata: While parsing, the ``dc:`` elements of the OPF keyed by
            local name; after ``load`` the keys are ``title``, ``creator``,
            ``identifier`` and ``level``.
    """

    # Annotations only: the containers are created per instance in __init__,
    # because class-level mutable defaults would be shared by every Book.
    sections: List[str]
    metadata: Dict[str, str]

    def __init__(self, src: str) -> None:
        """Parse the unpacked EPUB at ``src``, store it and reload it.

        Args:
            src: Directory that contains ``content.opf`` and the documents
                it references.
        """
        super().__init__()
        self.sections = []
        self.metadata = {}
        self.parse_book(src)
        book_id = self.store()  # Does nothing if already in database
        self.load(book_id)

    def load(self, book_id: int) -> None:
        """Replace ``metadata`` and ``sections`` with the stored values.

        Args:
            book_id: Value of ``books.book_id`` to load.

        Raises:
            Exception: If no row in ``books`` has this ``book_id``.
        """
        query = QSqlQuery()
        query.prepare("SELECT * FROM books where book_id = :book_id")
        query.bindValue(":book_id", book_id)
        if not query.exec():
            query_error(query)
        if not query.next():
            raise Exception(f"Missing book? book_id={book_id}")
        self.metadata = {
            "title": query.value("title"),
            "creator": query.value("author"),
            "identifier": query.value("uuid"),
            "level": query.value("level"),
        }

        self.sections = []
        query.prepare(
            "SELECT * FROM sections WHERE book_id = :book_id "
            "ORDER BY sequence"
        )
        query.bindValue(":book_id", book_id)
        if not query.exec():
            query_error(query)
        while query.next():
            # "content" is the column name store() inserts into.
            self.sections.append(query.value("content"))
        #
        # TODO: Load words!
        #

    def parse_book(self, src: str) -> None:
        """Read ``<src>/content.opf`` and parse every linear spine document.

        Fills ``metadata`` from the ``dc:``-prefixed children of
        ``<metadata>`` and appends one entry to ``sections`` per spine item
        (via ``parse_section``), in spine order.

        Args:
            src: Directory that contains ``content.opf``.

        Raises:
            ValueError: If a spine ``idref`` has no matching manifest item.
        """
        # Binary mode lets the XML parser take the encoding from the
        # document's own declaration instead of the platform default.
        with open(f"{src}/content.opf", "rb") as f:
            dom = xml.dom.minidom.parse(f)

        metadata = dom.getElementsByTagName("metadata")[0]
        for meta in metadata.childNodes:
            if meta.nodeType != xml.dom.Node.ELEMENT_NODE:
                continue
            if meta.prefix != "dc":
                continue
            value = meta.firstChild
            # An empty element such as <dc:description/> has no text child.
            if isinstance(value, xml.dom.minidom.Text):
                self.metadata[meta.localName] = value.data

        #
        # The manifest contains a list of all the files contained in this
        # EPUB.  Index it once by id; the first item wins on duplicate ids.
        #
        manifest = dom.getElementsByTagName("manifest")[0]
        hrefs: Dict[str, str] = {}
        for item in manifest.childNodes:
            if item.nodeType != xml.dom.Node.ELEMENT_NODE:
                continue
            hrefs.setdefault(
                item.getAttribute("id"), item.getAttribute("href")
            )

        #
        # The spine contains the documents in the order they are read
        #
        spine = dom.getElementsByTagName("spine")[0]
        for itemref in spine.childNodes:
            if itemref.nodeType != xml.dom.Node.ELEMENT_NODE:
                continue
            # getAttribute() returns "" when the attribute is absent, so
            # only an explicit linear="no" is skipped.
            if itemref.getAttribute("linear") == "no":
                continue
            idref = itemref.getAttribute("idref")
            #
            # A properly created *.opf will always have a correct
            # spine/manifest; fail loudly instead of reading a wrong file.
            #
            if idref not in hrefs:
                raise ValueError(
                    f"Spine item {idref!r} is not listed in the manifest"
                )
            href = hrefs[idref]
            print(f"{idref}: {href}")
            self.parse_section(src, href)
        #
        # "sections" is now loaded
        #

    def store(self) -> int:
        """Insert this book and its sections unless its uuid is already stored.

        Returns:
            The ``book_id`` of the existing row when ``metadata["identifier"]``
            is already present in ``books``, otherwise the id of the newly
            inserted row.
        """
        uuid = self.metadata["identifier"]
        query = QSqlQuery()
        query.prepare(
            "SELECT COUNT(*) AS number, book_id FROM books b "
            "WHERE b.uuid = :uuid"
        )
        query.bindValue(":uuid", uuid)
        if not query.exec():
            query_error(query)
        query.next()
        if query.value("number") > 0:
            book_id: int = query.value("book_id")
            return book_id

        query.prepare(
            "INSERT INTO books (title, author, uuid, level) VALUES ("
            ":title, :author, :uuid, 0)"
        )
        query.bindValue(":title", self.metadata["title"])
        query.bindValue(":author", self.metadata["creator"])
        query.bindValue(":uuid", uuid)
        if not query.exec():
            query_error(query)
        book_id = query.lastInsertId()

        query.prepare(
            "INSERT INTO sections (sequence, book_id, content) "
            "VALUES (:sequence, :book_id, :content)"
        )
        # book_id is the same for every section, so it is bound only once.
        query.bindValue(":book_id", book_id)
        for seq, section in enumerate(self.sections):
            query.bindValue(":sequence", seq)
            query.bindValue(":content", section)
            if not query.exec():
                query_error(query)
        return book_id

    def parse_section(self, src: str, href: str) -> None:
        """Reduce one document to its headings/paragraphs and append it.

        The document's ``<body>`` is walked recursively.  Elements whose
        local name starts with ``h`` and ``<p>`` elements are copied without
        attributes into a fresh ``<body>``; any other element is only
        descended into.  Inside a copied element images and comments are
        dropped and ``<a>`` elements are replaced by their children.  The
        result is serialized with ``toxml()`` and appended to ``sections``.

        Args:
            src: Directory of the unpacked EPUB.
            href: Path of the document, relative to ``src``.
        """
        newdom = xml.dom.getDOMImplementation().createDocument("", "html", None)
        text_types = (
            xml.dom.Node.TEXT_NODE,
            xml.dom.Node.CDATA_SECTION_NODE,
        )

        def copy_children(
            source: xml.dom.minidom.Node,
            target: xml.dom.minidom.Element,
            strip_text: bool,
        ) -> None:
            # Copy the children of `source` into `target`, filtering markup.
            for child in source.childNodes:
                if child.nodeType in text_types:
                    text = cast(xml.dom.minidom.Text, child).data
                    if strip_text:
                        # NOTE(review): stripping also removes the spaces
                        # around inline elements ("a <b>b</b> c") -- confirm
                        # that this is intended.
                        text = text.strip()
                    if text:
                        target.appendChild(newdom.createTextNode(text))
                elif child.nodeType != xml.dom.Node.ELEMENT_NODE:
                    # Comments, processing instructions, ...
                    continue
                elif child.localName == "img":
                    continue
                elif child.localName == "a":
                    # Unwrap the link: keep what it contains, with text
                    # left unstripped, but not the <a> element itself.
                    copy_children(child, target, False)
                else:
                    target.appendChild(
                        strip_node(cast(xml.dom.minidom.Element, child))
                    )

        def strip_node(elm: xml.dom.minidom.Element) -> xml.dom.minidom.Element:
            # Attribute-free copy of `elm` with filtered children.
            newelm: xml.dom.minidom.Element = newdom.createElement(
                elm.localName
            )
            copy_children(elm, newelm, True)
            return newelm

        def parse_node(
            parent: xml.dom.minidom.Element, node: xml.dom.minidom.Node
        ) -> None:
            # Append the headings/paragraphs found in `node` to `parent`.
            if node.nodeType != xml.dom.Node.ELEMENT_NODE:
                return
            elm = cast(xml.dom.minidom.Element, node)
            tag: str = cast(str, elm.localName)
            # NOTE(review): startswith("h") also matches e.g. "hr" and
            # "header", not only h1-h6 -- confirm whether that is wanted.
            if tag.startswith("h"):
                parent.appendChild(strip_node(elm))
            elif tag == "p":
                clone = strip_node(elm)
                clone.normalize()
                parent.appendChild(clone)
            else:
                for child in elm.childNodes:
                    parse_node(parent, child)

        # Binary mode: see parse_book(); the parser decodes the bytes.
        with open(f"{src}/{href}", "rb") as f:
            dom = xml.dom.minidom.parse(f)

        body = dom.getElementsByTagName("body")[0]
        section = newdom.createElement("body")
        for child in body.childNodes:
            parse_node(section, child)
        try:
            self.sections.append(section.toxml())
        except Exception:
            # Say which document failed to serialize before propagating.
            print(src, href)
            raise