Working checkpoint
This commit is contained in:
67
lib/books.py
Normal file
67
lib/books.py
Normal file
@@ -0,0 +1,67 @@
|
||||
import json
|
||||
import os
|
||||
import xml.dom.minidom
|
||||
|
||||
|
||||
class Book:
    """An unpacked EPUB read from a directory on disk.

    Attributes:
        sections: One dict per linear spine document, in reading order,
            shaped ``{"title": str, "paragraphs": list[str]}``; each
            paragraph is the serialized XML of a ``<p>`` element.
        metadata: ``dc:``-prefixed entries of the package ``<metadata>``
            element, keyed by local name (e.g. ``"title"``).
    """

    def __init__(self, src: str) -> None:
        """Load the book whose ``content.opf`` lives in directory *src*.

        Raises:
            OSError: If ``content.opf`` or a referenced document cannot be
                opened.
            xml.parsers.expat.ExpatError: If a file is not well-formed XML.
            IndexError: If the package document lacks a ``<metadata>``,
                ``<manifest>`` or ``<spine>`` element.
        """
        super().__init__()
        # These must be per-instance: as class attributes the list/dict
        # would be shared, so every Book would accumulate the sections and
        # metadata of all books loaded before it.
        self.sections: list = []
        self.metadata: dict = {}
        self.parse_book(src)

    @staticmethod
    def _first_child_data(node) -> str:
        """Return ``node.firstChild.data``, or ``""`` when there is none.

        Empty elements (``<title/>``) have no first child, and a first
        child that is itself an element has no ``data`` attribute.
        """
        return getattr(node.firstChild, "data", "")

    def parse_book(self, src: str) -> None:
        """Read ``{src}/content.opf`` and load metadata and spine documents.

        Fills ``self.metadata`` from the ``dc:`` children of ``<metadata>``
        and calls :meth:`parse_section` for every spine ``<itemref>`` that
        is not marked ``linear="no"``, in spine order.
        """
        # Binary mode lets the XML parser honour the encoding declared in
        # the document instead of the platform's default text encoding.
        with open(f"{src}/content.opf", "rb") as f:
            dom = xml.dom.minidom.parse(f)

        metadata = dom.getElementsByTagName("metadata")[0]
        for meta in metadata.childNodes:
            if meta.nodeType != xml.dom.Node.ELEMENT_NODE:
                continue
            if meta.prefix == "dc":
                self.metadata[meta.localName] = self._first_child_data(meta)

        #
        # The manifest contains a list of all the files contained in this
        # EPUB.  Index it once by id so each spine entry is a dict lookup
        # rather than a rescan; setdefault keeps the first item when an id
        # is (incorrectly) repeated.
        #
        manifest = dom.getElementsByTagName("manifest")[0]
        href_by_id: dict = {}
        for item in manifest.childNodes:
            if item.nodeType != xml.dom.Node.ELEMENT_NODE:
                continue
            href_by_id.setdefault(
                item.getAttribute("id"), item.getAttribute("href")
            )

        #
        # The spine contains the documents in order they are read
        #
        spine = dom.getElementsByTagName("spine")[0]
        for itemref in spine.childNodes:
            if itemref.nodeType != xml.dom.Node.ELEMENT_NODE:
                continue
            # If linear == "no" skip (getAttribute returns "" when absent).
            if itemref.getAttribute("linear") == "no":
                continue
            idref = itemref.getAttribute("idref")
            #
            # A properly created *.opf will always have a correct
            # spine/manifest, but a dangling idref must not fall through
            # to some unrelated manifest item, so it is skipped.
            #
            href = href_by_id.get(idref)
            if href is None:
                continue
            print(f"{idref}: {href}")
            self.parse_section(src, href)

    def parse_section(self, src: str, href: str) -> None:
        """Parse the document ``{src}/{href}`` and append it as a section.

        The appended dict holds the text of the first ``<title>`` element
        (``""`` when missing or empty) and the serialized XML of every
        ``<p>`` found anywhere under the first ``<body>`` (an empty list
        when there is no body).
        """
        with open(f"{src}/{href}", "rb") as f:
            dom = xml.dom.minidom.parse(f)

        titles = dom.getElementsByTagName("title")
        title = self._first_child_data(titles[0]) if titles else ""

        bodies = dom.getElementsByTagName("body")
        paragraphs = (
            [p.toxml() for p in bodies[0].getElementsByTagName("p")]
            if bodies
            else []
        )
        self.sections.append({"title": title, "paragraphs": paragraphs})
|
||||
Reference in New Issue
Block a user