Move to chapter-based reader

This commit is contained in:
Christopher T. Johnson
2023-11-15 17:25:13 -05:00
parent 0b02ed2201
commit 2b2f461d2f
5 changed files with 219 additions and 102 deletions

View File

@@ -11,7 +11,6 @@ from main import query_error
class Book:
sections: List[str] = []
metadata: Dict[str, str] = {}
words = {}
def __init__(self, src: str) -> None:
super(Book, self).__init__()
@@ -131,61 +130,66 @@ class Book:
def parse_section(self, src: str, href: str) -> None:
    """Parse one XHTML section file and store a simplified copy of its body.

    The file at ``f"{src}/{href}"`` is parsed with ``xml.dom.minidom``.  Its
    ``<body>`` is walked recursively and every element whose local name
    starts with ``"h"`` or equals ``"p"`` is copied into a fresh ``<body>``
    element; all other elements are only descended into.  Copies are
    simplified by ``strip_node``: attributes are dropped, ``<img>`` children
    are removed, ``<a>`` children are replaced by their contents, and
    whitespace-only text is discarded.  The serialized result
    (``toxml()``) is appended to ``self.sections``.

    Args:
        src: Directory that contains the section file.
        href: Path of the section file relative to ``src``.

    Raises:
        OSError: If the file cannot be opened.
        xml.parsers.expat.ExpatError: If the file is not well-formed XML.
        IndexError: If the document has no ``<body>`` element.
    """
    newdom = xml.dom.getDOMImplementation().createDocument("", "html", None)
    # CDATA sections carry character data just like text nodes do.
    text_types = (xml.dom.Node.TEXT_NODE, xml.dom.Node.CDATA_SECTION_NODE)

    def strip_node(elm: xml.dom.minidom.Element) -> xml.dom.minidom.Element:
        """Return a simplified copy of *elm* that is owned by ``newdom``."""
        if elm.nodeType in text_types:
            return cast(
                xml.dom.minidom.Element,
                newdom.createTextNode(cast(xml.dom.minidom.Text, elm).data),
            )
        newelm: xml.dom.minidom.Element = newdom.createElement(elm.localName)
        node = elm.firstChild
        while node:
            child = cast(xml.dom.minidom.Element, node)
            if child.nodeType in text_types:
                text = cast(xml.dom.minidom.Text, child).data
                if text:
                    text = text.strip()
                if text:
                    newelm.appendChild(newdom.createTextNode(text))
            elif child.nodeType != xml.dom.Node.ELEMENT_NODE:
                # Comments and processing instructions have no localName;
                # copying them as elements would create a tag named None
                # and make toxml() fail, so they are dropped.
                pass
            elif child.localName == "img":
                # Images are not part of the text-only rendition.
                pass
            elif child.localName == "a":
                # Unwrap the link: keep its contents, drop the <a> itself.
                # Text inside a link is kept verbatim (not stripped).
                a_node = child.firstChild
                while a_node:
                    if a_node.nodeType in text_types:
                        link_text = cast(xml.dom.minidom.Text, a_node).data
                        newelm.appendChild(newdom.createTextNode(link_text))
                    elif a_node.nodeType == xml.dom.Node.ELEMENT_NODE:
                        newelm.appendChild(
                            strip_node(cast(xml.dom.minidom.Element, a_node))
                        )
                    a_node = a_node.nextSibling
            else:
                newelm.appendChild(strip_node(child))
            node = node.nextSibling
        return newelm

    def parse_node(
        parent: xml.dom.minidom.Element, elm: xml.dom.minidom.Element
    ) -> None:
        """Copy heading/paragraph elements found under *elm* into *parent*."""
        if elm.nodeType != xml.dom.Node.ELEMENT_NODE:
            return
        tag: str = cast(str, elm.localName)
        # NOTE(review): startswith("h") also matches tags such as "hr" and
        # "header", not only h1-h6 -- confirm whether that is intended.
        if tag.startswith("h"):
            parent.appendChild(strip_node(elm))
        elif tag == "p":
            clone = strip_node(elm)
            # Merge adjacent text nodes left behind by removed/unwrapped
            # children.
            clone.normalize()
            parent.appendChild(clone)
        else:
            node = elm.firstChild
            while node:
                parse_node(parent, cast(xml.dom.minidom.Element, node))
                node = node.nextSibling

    # Binary mode lets the XML parser apply the encoding declared in the
    # document instead of decoding with the locale's default encoding.
    with open(f"{src}/{href}", "rb") as f:
        dom = xml.dom.minidom.parse(f)
    body = dom.getElementsByTagName("body")[0]
    section = newdom.createElement("body")
    node = body.firstChild
    while node:
        parse_node(section, cast(xml.dom.minidom.Element, node))
        node = node.nextSibling
    self.sections.append(section.toxml())
    return