Files
esl-reader/lib/books.py
Christopher T. Johnson 598201425c Working checkpoint
2023-11-10 21:48:15 -05:00

68 lines
2.2 KiB
Python

import json
import os
import xml.dom.minidom
class Book:
    """An unpacked EPUB book loaded from a directory on disk.

    The directory must contain a ``content.opf`` package document. Its
    ``<metadata>``, ``<manifest>`` and ``<spine>`` elements drive parsing.

    Instance attributes:
        sections: list of ``{"title": str, "paragraphs": list[str]}`` dicts,
            one per linear spine entry, in reading order. Each paragraph is
            the serialized XML of one ``<p>`` element.
        metadata: dict mapping the local name of every ``dc:``-prefixed
            child of ``<metadata>`` (e.g. ``"title"``) to its text content.
    """

    def __init__(self, src: str) -> None:
        """Create the book by parsing the unpacked EPUB in directory *src*."""
        super().__init__()
        # These must be per-instance: as class-level attributes the list and
        # dict would be shared by (and accumulate across) every Book created.
        self.sections = []
        self.metadata = {}
        self.parse_book(src)

    @staticmethod
    def _text_of(node) -> str:
        """Return the joined direct text/CDATA content of *node* ('' if none)."""
        text_types = (
            xml.dom.Node.TEXT_NODE,
            xml.dom.Node.CDATA_SECTION_NODE,
        )
        return "".join(
            child.data
            for child in node.childNodes
            if child.nodeType in text_types
        )

    def parse_book(self, src: str) -> None:
        """Read ``{src}/content.opf`` and load metadata and all linear sections.

        Args:
            src: Directory holding the unpacked EPUB files.

        Raises:
            OSError: If the package document or a section file cannot be opened.
            IndexError: If ``<metadata>``, ``<manifest>`` or ``<spine>`` is absent.
            ValueError: If a spine ``itemref`` names an id that is not in the
                manifest.
        """
        # Binary mode lets the XML parser apply the encoding declared by the
        # document itself rather than the platform's default text encoding.
        with open(f"{src}/content.opf", "rb") as f:
            dom = xml.dom.minidom.parse(f)

        metadata = dom.getElementsByTagName("metadata")[0]
        for meta in metadata.childNodes:
            if meta.nodeType != xml.dom.Node.ELEMENT_NODE:
                continue
            if meta.prefix == "dc":
                # Empty elements (<dc:x/>) yield "" instead of crashing.
                self.metadata[meta.localName] = self._text_of(meta)

        #
        # The manifest contains a list of all the files contained in this
        # EPUB.  Index it once by id so each spine entry is a dict lookup;
        # setdefault keeps the first entry should an id be duplicated.
        #
        manifest = dom.getElementsByTagName("manifest")[0]
        href_by_id = {}
        for item in manifest.childNodes:
            if item.nodeType != xml.dom.Node.ELEMENT_NODE:
                continue
            href_by_id.setdefault(
                item.getAttribute("id"), item.getAttribute("href")
            )

        #
        # The spine contains the documents in the order they are read.
        #
        spine = dom.getElementsByTagName("spine")[0]
        for itemref in spine.childNodes:
            if itemref.nodeType != xml.dom.Node.ELEMENT_NODE:
                continue
            # Skip non-linear entries; getAttribute returns "" when the
            # attribute is absent, so no separate hasAttribute check is needed.
            if itemref.getAttribute("linear") == "no":
                continue
            idref = itemref.getAttribute("idref")
            #
            # A properly created *.opf will always have a correct
            # spine/manifest; fail loudly rather than parse the wrong file
            # when it does not.
            #
            try:
                href = href_by_id[idref]
            except KeyError:
                raise ValueError(
                    f"spine itemref {idref!r} has no matching manifest item"
                ) from None
            print(f"{idref}: {href}")
            self.parse_section(src, href)

    def parse_section(self, src: str, href: str) -> None:
        """Parse one content document and append it to ``self.sections``.

        Args:
            src: Directory holding the unpacked EPUB files.
            href: Path of the section file, relative to *src*.

        Raises:
            OSError: If the section file cannot be opened.
            IndexError: If the document has no ``<body>`` element.
        """
        # Binary mode: see parse_book.
        with open(f"{src}/{href}", "rb") as f:
            dom = xml.dom.minidom.parse(f)

        # A missing or empty <title> gives an empty title instead of a crash.
        title_nodes = dom.getElementsByTagName("title")
        title = self._text_of(title_nodes[0]) if title_nodes else ""

        body = dom.getElementsByTagName("body")[0]
        # getElementsByTagName searches all descendants in document order.
        paragraphs = [p.toxml() for p in body.getElementsByTagName("p")]
        self.sections.append({"title": title, "paragraphs": paragraphs})