208 lines
7.4 KiB
Python
208 lines
7.4 KiB
Python
|
|
import xml.dom.minidom
|
|
from typing import Dict, List, cast
|
|
|
|
from PyQt6.QtCore import QCoreApplication
|
|
from PyQt6.QtSql import QSqlQuery
|
|
|
|
from lib import query_error
|
|
|
|
|
|
class Book:
    """An unpacked EPUB, parsed from disk and mirrored in the SQL database.

    Constructing a ``Book`` parses ``<src>/content.opf`` and every linear
    spine document, stores the result in the ``books``/``sections`` tables
    if the book's identifier is not there yet, and finally reloads the
    instance state from the database.

    Attributes:
        sections: One serialized ``<body>`` XML string per section, in
            reading order.
        metadata: Book metadata keyed by Dublin Core local name
            (``title``, ``creator``, ``identifier``, ...); after ``load``
            it also carries ``level``.
    """

    # Annotations only: these used to be mutable class-level defaults, which
    # made every Book instance share (and keep appending to) the same list
    # and dict.  They are now created per instance in __init__.
    sections: List[str]
    metadata: Dict[str, str]

    def __init__(self, src: str) -> None:
        """Parse the unpacked EPUB in directory *src* and sync it with the DB."""
        super().__init__()
        self.sections = []
        self.metadata = {}
        self.parse_book(src)
        book_id = self.store()  # Does nothing if already in database
        self.load(book_id)

    def load(self, book_id: int) -> None:
        """Replace ``metadata`` and ``sections`` with the rows for *book_id*.

        Failed queries are handed to ``query_error`` (imported from ``lib``;
        whether it raises is not visible from this file).

        Raises:
            Exception: if no row in ``books`` has this ``book_id``.
        """
        translate = QCoreApplication.translate
        query = QSqlQuery()
        query.prepare("SELECT * FROM books where book_id = :book_id")
        query.bindValue(":book_id", book_id)
        if not query.exec():
            query_error(query)
        if not query.next():
            raise Exception(
                translate("Book", "Missing book? book_id=") + f"{book_id}"
            )
        self.metadata = {
            "title": query.value("title"),
            "creator": query.value("author"),
            "identifier": query.value("uuid"),
            "level": query.value("level"),
        }

        self.sections = []
        # The statement previously read "SELECT * FORM ..." and was prepared
        # but never bound or executed, so no section was ever loaded.
        query.prepare(
            "SELECT * FROM sections WHERE book_id = :book_id "
            "ORDER BY sequence"
        )
        query.bindValue(":book_id", book_id)
        if not query.exec():
            query_error(query)
        while query.next():
            # Column name matches the INSERT in store() ("content"); the old
            # code read "contents".  NOTE(review): confirm against the schema.
            self.sections.append(query.value("content"))
        # TODO: load words.

    def parse_book(self, src: str) -> None:
        """Read ``<src>/content.opf`` and parse every linear spine document.

        Dublin Core elements (prefix ``dc``) of the ``<metadata>`` element
        are copied into ``self.metadata``; each spine ``itemref`` that is
        not marked ``linear="no"`` is resolved through the manifest and
        passed to ``parse_section``, which appends to ``self.sections``.
        """
        # Binary mode lets the XML parser honour the encoding declared in
        # the document instead of the platform's locale encoding.
        with open(f"{src}/content.opf", "rb") as f:
            dom = xml.dom.minidom.parse(f)

        metadata = dom.getElementsByTagName("metadata")[0]
        for meta in metadata.childNodes:
            if meta.nodeType != xml.dom.Node.ELEMENT_NODE:
                continue
            if meta.prefix != "dc":
                continue
            # An empty element (e.g. <dc:subject/>) has no child to read.
            value = getattr(meta.firstChild, "data", None)
            if value is None:
                continue
            self.metadata[meta.localName] = value

        #
        # The manifest contains a list of all the files contained in this
        # EPUB.  Index it once by id; the first entry wins on duplicate ids,
        # as it did with the former linear search.
        #
        manifest = dom.getElementsByTagName("manifest")[0]
        hrefs: Dict[str, str] = {}
        for item in manifest.childNodes:
            if item.nodeType != xml.dom.Node.ELEMENT_NODE:
                continue
            hrefs.setdefault(item.getAttribute("id"), item.getAttribute("href"))

        #
        # The spine contains the documents in the order they are read.
        #
        spine = dom.getElementsByTagName("spine")[0]
        for itemref in spine.childNodes:
            if itemref.nodeType != xml.dom.Node.ELEMENT_NODE:
                continue
            # getAttribute returns "" for a missing attribute, so this also
            # covers itemrefs without a "linear" attribute.
            if itemref.getAttribute("linear") == "no":
                continue
            idref = itemref.getAttribute("idref")
            href = hrefs.get(idref)
            if href is None:
                # A properly created *.opf always has a matching manifest
                # entry.  Previously a dangling idref silently fell through
                # to the last manifest child; now it is skipped.
                print(f"{idref}: not found in manifest, skipped")
                continue
            print(f"{idref}: {href}")
            self.parse_section(src, href)

    def store(self) -> int:
        """Insert the book and its sections unless its identifier is known.

        The book is looked up by ``metadata["identifier"]`` (column
        ``uuid``).  If a row exists nothing is written.

        Returns:
            The ``book_id`` of the existing or newly inserted row.
        """
        uuid = self.metadata["identifier"]
        query = QSqlQuery()
        query.prepare(
            "SELECT COUNT(*) AS number, book_id FROM books b "
            "WHERE b.uuid = :uuid"
        )
        query.bindValue(":uuid", uuid)
        if not query.exec():
            query_error(query)
        query.next()
        if query.value("number") > 0:
            existing_id: int = query.value("book_id")
            return existing_id

        query.prepare(
            "INSERT INTO books (title, author, uuid, level) VALUES ("
            ":title, :author, :uuid, 0)"
        )
        query.bindValue(":title", self.metadata["title"])
        query.bindValue(":author", self.metadata["creator"])
        query.bindValue(":uuid", uuid)
        if not query.exec():
            query_error(query)
        book_id: int = query.lastInsertId()

        # One prepared statement, re-executed per section with new values.
        query.prepare(
            "INSERT INTO sections (sequence, book_id, content) "
            "VALUES (:sequence, :book_id, :content)"
        )
        query.bindValue(":book_id", book_id)
        for seq, section in enumerate(self.sections):
            query.bindValue(":sequence", seq)
            query.bindValue(":content", section)
            if not query.exec():
                query_error(query)
        return book_id

    def parse_section(self, src: str, href: str) -> None:
        """Reduce the document ``<src>/<href>`` to its text structure.

        Every element under ``<body>`` whose tag starts with ``h`` or equals
        ``p`` is copied (attributes dropped, images and comments removed,
        ``<a>`` replaced by its content) into a fresh ``<body>`` element.
        The serialized result is appended to ``self.sections``.
        """
        newdom = xml.dom.getDOMImplementation().createDocument("", "html", None)

        def copy_children(
            source: xml.dom.minidom.Element,
            target: xml.dom.minidom.Element,
            trim: bool,
        ) -> None:
            # Copy the cleaned children of `source` into `target`.
            for child in source.childNodes:
                kind = child.nodeType
                if kind == xml.dom.Node.TEXT_NODE:
                    text = cast(xml.dom.minidom.Text, child).data or ""
                    if trim:
                        text = text.strip()
                    if text:
                        target.appendChild(newdom.createTextNode(text))
                elif kind != xml.dom.Node.ELEMENT_NODE:
                    # Comments, processing instructions, ... carry no text
                    # we want; they must not be turned into elements.
                    continue
                elif child.localName == "img":
                    continue
                elif child.localName == "a":
                    # Unwrap the link: keep its content (text untrimmed, as
                    # before), drop the <a> element itself.
                    copy_children(
                        cast(xml.dom.minidom.Element, child), target, False
                    )
                else:
                    target.appendChild(
                        strip_node(cast(xml.dom.minidom.Element, child))
                    )

        def strip_node(elm: xml.dom.minidom.Element) -> xml.dom.minidom.Element:
            # Attribute-free copy of `elm` with cleaned-up content.
            newelm: xml.dom.minidom.Element = newdom.createElement(
                elm.localName
            )
            copy_children(elm, newelm, True)
            return newelm

        def parse_node(
            parent: xml.dom.minidom.Element, elm: xml.dom.minidom.Element
        ) -> None:
            # Collect headings and paragraphs below `elm` into `parent`.
            if elm.nodeType != xml.dom.Node.ELEMENT_NODE:
                return
            tag: str = cast(str, elm.localName)
            # NOTE(review): startswith("h") also matches hr/header/hgroup,
            # not only h1..h6 -- confirm that this is intended.
            if tag.startswith("h"):
                parent.appendChild(strip_node(elm))
            elif tag == "p":
                clone = strip_node(elm)
                clone.normalize()  # merge adjacent text nodes
                parent.appendChild(clone)
            else:
                for child in elm.childNodes:
                    parse_node(parent, cast(xml.dom.minidom.Element, child))

        # Binary mode: let the parser apply the document's declared encoding.
        with open(f"{src}/{href}", "rb") as f:
            dom = xml.dom.minidom.parse(f)

        body = dom.getElementsByTagName("body")[0]
        section = newdom.createElement("body")
        for child in body.childNodes:
            parse_node(section, cast(xml.dom.minidom.Element, child))
        try:
            self.sections.append(section.toxml())
        except Exception:
            # Identify the offending document before propagating the error.
            print(src, href)
            raise
|