196 lines
		
	
	
		
			7.1 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
			
		
		
	
	
			196 lines
		
	
	
		
			7.1 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
| import json
 | |
| import os
 | |
| import xml.dom.minidom
 | |
| from typing import Dict, List, cast
 | |
| 
 | |
| from PyQt6.QtSql import QSqlQuery
 | |
| 
 | |
| from main import query_error
 | |
| 
 | |
| 
 | |
class Book:
    """An EPUB book parsed from an unpacked directory and mirrored in the DB.

    Constructing a ``Book`` parses ``<src>/content.opf``, stores the book and
    its sections in the database if its identifier is not there yet, and then
    reloads ``metadata`` and ``sections`` from the database.

    Attributes:
        sections: One serialized ``<body>...</body>`` XML string per spine
            document, in reading order.
        metadata: Book metadata.  After parsing, the keys are the local names
            of the ``dc:`` elements found in the OPF; after ``load`` they are
            ``title``, ``creator``, ``identifier`` and ``level``.
    """

    # Annotations only: the containers are created per instance in __init__.
    # Class-level ``[]`` / ``{}`` defaults would be shared by every Book and
    # leak one book's sections and metadata into the next one parsed.
    sections: List[str]
    metadata: Dict[str, str]

    def __init__(self, src: str) -> None:
        """Parse the unpacked EPUB at *src*, persist it, and load it back.

        Args:
            src: Directory that contains ``content.opf`` and the documents
                it references.
        """
        super().__init__()
        self.sections = []
        self.metadata = {}
        self.parse_book(src)
        book_id = self.store()  # Returns the existing id if already stored
        self.load(book_id)

    def load(self, book_id: int) -> None:
        """Replace ``metadata`` and ``sections`` with the rows for *book_id*.

        Args:
            book_id: Primary key of the row in the ``books`` table.

        Raises:
            Exception: If no ``books`` row has this ``book_id``.
        """
        query = QSqlQuery()
        query.prepare("SELECT * FROM books where book_id = :book_id")
        query.bindValue(":book_id", book_id)
        if not query.exec():
            query_error(query)
        if not query.next():
            raise Exception(f"Missing book? book_id={book_id}")
        self.metadata = {
            "title": query.value("title"),
            "creator": query.value("author"),
            "identifier": query.value("uuid"),
            "level": query.value("level"),
        }

        # A prepared statement returns rows only after it has been bound and
        # executed.
        query.prepare(
            "SELECT * FROM sections WHERE book_id = :book_id ORDER BY sequence"
        )
        query.bindValue(":book_id", book_id)
        if not query.exec():
            query_error(query)
        loaded: List[str] = []
        while query.next():
            # NOTE(review): column name taken from the INSERT in store();
            # confirm against the actual schema.
            loaded.append(query.value("content"))
        self.sections = loaded
        #
        # Load words!
        #

    def parse_book(self, src: str) -> None:
        """Read ``<src>/content.opf`` and parse every linear spine document.

        Fills ``self.metadata`` from the ``dc:``-prefixed children of
        ``<metadata>`` and appends one entry to ``self.sections`` per spine
        item, in spine order (items with ``linear="no"`` are skipped).

        Args:
            src: Directory that contains ``content.opf``.

        Raises:
            ValueError: If a spine ``itemref`` names an id that is not in the
                manifest.
        """
        # Binary mode lets the XML parser handle the encoding instead of the
        # platform's locale default.
        with open(f"{src}/content.opf", "rb") as f:
            dom = xml.dom.minidom.parse(f)

        metadata = dom.getElementsByTagName("metadata")[0]
        for meta in metadata.childNodes:
            if meta.nodeType != xml.dom.Node.ELEMENT_NODE or meta.prefix != "dc":
                continue
            first = meta.firstChild
            # An empty element such as <dc:description/> has no text child.
            if first is not None and first.nodeType == xml.dom.Node.TEXT_NODE:
                self.metadata[meta.localName] = first.data
        #
        # The manifest contains a list of all the files contained in this
        # EPUB.  Index it once by id; the first entry wins on duplicate ids.
        #
        manifest = dom.getElementsByTagName("manifest")[0]
        href_by_id: Dict[str, str] = {}
        for item in manifest.childNodes:
            if item.nodeType == xml.dom.Node.ELEMENT_NODE:
                href_by_id.setdefault(
                    item.getAttribute("id"), item.getAttribute("href")
                )
        #
        # The spine contains the documents in order they are read
        #
        spine = dom.getElementsByTagName("spine")[0]
        for itemref in spine.childNodes:
            if itemref.nodeType != xml.dom.Node.ELEMENT_NODE:
                continue
            # getAttribute returns "" when the attribute is absent, so this
            # also covers the "no linear attribute" case.
            if itemref.getAttribute("linear") == "no":
                continue
            idref = itemref.getAttribute("idref")
            #
            # A properly created *.opf will always have a correct
            # spine/manifest; fail loudly instead of parsing the wrong file.
            #
            if idref not in href_by_id:
                raise ValueError(
                    f"Spine item {idref!r} is not listed in the manifest"
                )
            href = href_by_id[idref]
            print(f"{idref}: {href}")
            self.parse_section(src, href)
        #
        # "sections" is now loaded
        #

    def store(self) -> int:
        """Insert this book and its sections unless its uuid already exists.

        Returns:
            The ``book_id`` of the existing row when a book with the same
            ``identifier`` is already stored, otherwise the id of the row
            just inserted.
        """
        uuid = self.metadata["identifier"]
        query = QSqlQuery()
        query.prepare(
            "SELECT COUNT(*) AS number, book_id FROM books b " "WHERE b.uuid = :uuid"
        )
        query.bindValue(":uuid", uuid)
        if not query.exec():
            query_error(query)
        query.next()
        if query.value("number") > 0:
            existing_id: int = query.value("book_id")
            return existing_id

        query.prepare(
            "INSERT INTO books (title, author, uuid, level) VALUES ("
            ":title, :author, :uuid, 0)"
        )
        query.bindValue(":title", self.metadata["title"])
        query.bindValue(":author", self.metadata["creator"])
        query.bindValue(":uuid", uuid)
        if not query.exec():
            query_error(query)
        book_id: int = query.lastInsertId()

        query.prepare(
            "INSERT INTO sections (sequence, book_id, content) "
            "VALUES (:sequence, :book_id, :content)"
        )
        # :book_id is bound once; only the per-section values are rebound
        # inside the loop.
        query.bindValue(":book_id", book_id)
        for seq, section in enumerate(self.sections):
            query.bindValue(":sequence", seq)
            query.bindValue(":content", section)
            if not query.exec():
                query_error(query)
        return book_id

    def parse_section(self, src: str, href: str) -> None:
        """Reduce one document to headings/paragraphs and append it.

        Parses ``<src>/<href>``, walks its ``<body>``, and collects a stripped
        copy of every element whose tag starts with ``h`` and every ``<p>``
        into a fresh ``<body>`` element.  The serialized result is appended
        to ``self.sections``.

        Args:
            src: Directory of the unpacked EPUB.
            href: Path of the document, as given in the manifest.
        """
        newdom = xml.dom.getDOMImplementation().createDocument("", "html", None)

        def strip_node(elm: xml.dom.minidom.Element) -> xml.dom.minidom.Element:
            """Return an attribute-free copy of *elm* built in ``newdom``.

            Images are dropped, ``<a>`` wrappers are replaced by their
            children, and direct text children are whitespace-stripped
            (whitespace-only text disappears).
            """
            if elm.nodeType == xml.dom.Node.TEXT_NODE:
                return cast(
                    xml.dom.minidom.Element,
                    newdom.createTextNode(cast(xml.dom.minidom.Text, elm).data),
                )

            copy: xml.dom.minidom.Element = newdom.createElement(elm.localName)
            for child in elm.childNodes:
                kind = child.nodeType
                if kind == xml.dom.Node.TEXT_NODE:
                    # NOTE(review): stripping also removes the space between
                    # text and a following inline element -- confirm intended.
                    text = child.data.strip()
                    if text:
                        copy.appendChild(newdom.createTextNode(text))
                elif kind != xml.dom.Node.ELEMENT_NODE:
                    # Comments and processing instructions have no localName
                    # and cannot be rebuilt as elements.
                    continue
                elif child.localName == "img":
                    continue
                elif child.localName == "a":
                    # Unwrap the anchor: keep its content, lose the link.
                    for inner in child.childNodes:
                        if inner.nodeType == xml.dom.Node.TEXT_NODE:
                            copy.appendChild(newdom.createTextNode(inner.data))
                        elif inner.nodeType == xml.dom.Node.ELEMENT_NODE:
                            copy.appendChild(strip_node(inner))
                else:
                    copy.appendChild(strip_node(child))
            return copy

        def parse_node(
            parent: xml.dom.minidom.Element, elm: xml.dom.minidom.Element
        ) -> None:
            """Append stripped headings/paragraphs found under *elm* to *parent*."""
            if elm.nodeType != xml.dom.Node.ELEMENT_NODE:
                return
            tag: str = cast(str, elm.localName)
            # NOTE(review): startswith("h") also matches e.g. <hr> and
            # <header>, not only h1-h6 -- confirm this is wanted.
            if tag.startswith("h"):
                parent.appendChild(strip_node(elm))
            elif tag == "p":
                clone = strip_node(elm)
                clone.normalize()  # merge adjacent text nodes
                parent.appendChild(clone)
            else:
                # Containers (div, section, ...) are flattened.
                for child in elm.childNodes:
                    parse_node(parent, cast(xml.dom.minidom.Element, child))

        # Binary mode: let the XML parser handle the encoding.
        with open(f"{src}/{href}", "rb") as f:
            dom = xml.dom.minidom.parse(f)
        # title = dom.getElementsByTagName("title")[0].firstChild.data
        body = dom.getElementsByTagName("body")[0]
        section = newdom.createElement("body")
        for child in body.childNodes:
            parse_node(section, cast(xml.dom.minidom.Element, child))
        self.sections.append(section.toxml())
 |