68 lines
2.2 KiB
Python
68 lines
2.2 KiB
Python
import json
|
|
import os
|
|
import xml.dom.minidom
|
|
|
|
|
|
class Book:
    """In-memory view of an unpacked EPUB directory.

    Constructing a ``Book`` reads ``content.opf`` from the given directory,
    collects the ``dc:``-prefixed metadata elements and then loads every
    linear spine document in reading order.

    Attributes:
        sections: One dict per loaded spine document with the keys
            ``"title"`` (text of the document's first ``<title>`` element,
            ``""`` when absent or empty) and ``"paragraphs"`` (a list with
            the serialized XML of every ``<p>`` inside ``<body>``).
        metadata: Maps the local name of each ``dc:`` element found under
            ``<metadata>`` (for example ``"title"``) to its text.
    """

    # Annotations only: the containers themselves are created per instance
    # in __init__. Defining them here as `[]` / `{}` would make every Book
    # share (and keep appending to) the same list and dict.
    sections: list
    metadata: dict

    def __init__(self, src: str) -> None:
        """Load the book stored in directory *src*.

        Args:
            src: Path of the directory that contains ``content.opf`` and
                the documents referenced by its manifest.
        """
        super().__init__()
        self.sections = []
        self.metadata = {}
        self.parse_book(src)

    @staticmethod
    def _first_text(node) -> str:
        """Return the text of *node*'s first child, or ``""`` if it has none."""
        # `firstChild` is None for an empty element such as <dc:creator/>,
        # and a non-text first child has no `data` attribute; both yield "".
        return getattr(node.firstChild, "data", "")

    def parse_book(self, src: str) -> None:
        """Read ``content.opf`` in *src* and load metadata and sections.

        Spine entries marked ``linear="no"`` are skipped, as are entries
        whose ``idref`` does not match any manifest item. Each loaded entry
        is echoed to stdout as ``"<idref>: <href>"``.

        Args:
            src: Directory containing ``content.opf``.

        Raises:
            OSError: If ``content.opf`` or a referenced document cannot be
                opened.
            IndexError: If the package document lacks a ``<metadata>``,
                ``<manifest>`` or ``<spine>`` element.
        """
        # Binary mode lets the XML parser honour the encoding declared by
        # the document instead of decoding with the platform's locale.
        with open(f"{src}/content.opf", "rb") as f:
            dom = xml.dom.minidom.parse(f)

        metadata = dom.getElementsByTagName("metadata")[0]
        for meta in metadata.childNodes:
            if meta.nodeType != xml.dom.Node.ELEMENT_NODE:
                continue
            if meta.prefix == "dc":
                self.metadata[meta.localName] = self._first_text(meta)

        #
        # The manifest contains a list of all the files contained in this
        # EPUB. Index it once by id so that every spine entry is resolved
        # with a single lookup; `setdefault` keeps the first item when an id
        # is (incorrectly) repeated.
        #
        manifest = dom.getElementsByTagName("manifest")[0]
        hrefs = {}
        for item in manifest.childNodes:
            if item.nodeType != xml.dom.Node.ELEMENT_NODE:
                continue
            hrefs.setdefault(item.getAttribute("id"), item.getAttribute("href"))

        #
        # The spine contains the documents in the order they are read.
        #
        spine = dom.getElementsByTagName("spine")[0]
        for itemref in spine.childNodes:
            if itemref.nodeType != xml.dom.Node.ELEMENT_NODE:
                continue

            # getAttribute() returns "" for a missing attribute, so this one
            # comparison also covers "no linear attribute at all".
            if itemref.getAttribute("linear") == "no":
                continue

            idref = itemref.getAttribute("idref")
            href = hrefs.get(idref)
            if href is None:
                # A properly created *.opf always has a matching manifest
                # item; tolerate a dangling reference instead of loading an
                # unrelated document or crashing on a whitespace node.
                continue

            print(f"{idref}: {href}")
            self.parse_section(src, href)

    def parse_section(self, src: str, href: str) -> None:
        """Parse one spine document and append it to ``self.sections``.

        Args:
            src: Directory of the unpacked EPUB.
            href: Path of the document relative to *src*, as given by the
                manifest.

        Raises:
            OSError: If the document cannot be opened.
        """
        # Binary mode: see parse_book.
        with open(f"{src}/{href}", "rb") as f:
            dom = xml.dom.minidom.parse(f)

        titles = dom.getElementsByTagName("title")
        title = self._first_text(titles[0]) if titles else ""

        bodies = dom.getElementsByTagName("body")
        paragraphs = (
            [p.toxml() for p in bodies[0].getElementsByTagName("p")]
            if bodies
            else []
        )

        self.sections.append({"title": title, "paragraphs": paragraphs})
|