# frozen_string_literal: true

# Provenance: added as app/scanner/opf.rb (new file) in quanlib.git commit
# 0d86ea9d09fe23a2980a59a551ac06dec0a2a7bd
# From: Chris Jaekl
# Date: Thu, 27 Jun 2024 02:56:10 +0000 (-0400)
# Subject: (Add scanner.opf which was forgotten in earlier commit)
# X-Git-Url: https://jaekl.net/gitweb/?a=commitdiff_plain;h=0d86ea9d09fe23a2980a59a551ac06dec0a2a7bd;p=quanlib.git

require "pathname"

require "nokogiri"
require "zip"

require "scanner/scanner"

module Scanner
  # Scans the OPF package document inside an .epub (zip) file and copies the
  # metadata it finds (author, title, description, language, volume and cover)
  # onto the book it was constructed with.
  class Opf
    # XML namespace of the Dublin Core elements (dc:title, dc:creator, ...).
    DC_NS_URL = "http://purl.org/dc/elements/1.1/"
    private_constant :DC_NS_URL

    # @param book the book record to populate. This class reads its +author+
    #   and +path+, and assigns +author+, +title+, +description+, +language+,
    #   +volume+ and +cover+.
    def initialize(book)
      @book = book
    end

    # Parses the OPF document stored at +opf_path+ inside +zipfile+ and updates
    # the book with whatever metadata is present.
    #
    # @param zipfile the opened epub archive; must respond to +read+ and
    #   +find_entry+
    # @param opf_path [String] name of the OPF entry inside the archive
    # @return the book that was passed to the constructor
    def scan_opf(zipfile, opf_path)
      # Fallback manifest id for the cover image; load_other_metadata! replaces
      # it when the OPF names the cover through a meta element called "cover".
      @cover_id = "cover-image"

      opf_doc = Nokogiri::XML(zipfile.read(opf_path))

      load_author!(opf_doc)
      load_title!(opf_doc)
      load_description!(opf_doc)
      load_language!(opf_doc)
      # Must run before load_cover, because it may change @cover_id.
      load_other_metadata!(opf_doc)
      @book.cover = load_cover(zipfile, opf_path, opf_doc, @cover_id)

      @book
    end

    private

    # Locates the zip entry that a manifest +href+ refers to.
    #
    # @param zipfile the opened epub archive (responds to +find_entry+)
    # @param href [String, nil] the manifest item's href attribute
    # @param opf_path [String] name of the OPF entry inside the archive
    # @return the first matching zip entry, or nil when href is missing/empty
    #   or none of the candidate paths exists in the archive
    def find_cover_entry(zipfile, href, opf_path)
      # A manifest item without an href cannot point at anything. Bail out
      # early instead of probing "<opf dir>/" (which could match a directory
      # entry) or calling String methods on nil.
      return nil if href.nil? || href.empty?

      cover_path_candidates(href, opf_path).each do |path|
        entry = zipfile.find_entry(path)
        return entry if entry
      end

      nil
    end

    # Builds the ordered list of archive paths that +href+ might mean.
    #
    # The OPF specification resolves manifest hrefs relative to the OPF file
    # itself, but books encountered in the wild also use paths relative to the
    # root of the epub (zip), so several interpretations are tried in turn.
    #
    # @param href [String] non-empty manifest href
    # @param opf_path [String] name of the OPF entry inside the archive
    # @return [Array<String>] unique, non-empty candidate entry names
    def cover_path_candidates(href, opf_path)
      candidates = [href] # 1. href taken relative to the archive root

      opf_base_path = opf_path.split("/")[0..-2].join("/")
      # When the OPF sits at the archive root the opf-relative path is the
      # same as the href, so there is nothing further to try.
      unless opf_base_path.empty?
        relative = "#{opf_base_path}/#{href}"
        candidates << relative # 2. href relative to the OPF's directory
        # 3. the same path with "." and ".." segments collapsed, e.g.
        #    "OEBPS/pkg/../images/cover.jpg" becomes "OEBPS/images/cover.jpg".
        candidates << Pathname.new(relative).cleanpath.to_s
      end

      # 4. Another case found in the wild: cover image is at the root, but the
      #    path is '../cover.jpeg'.
      candidates << href[3..] if href.start_with?("../")

      candidates.reject(&:empty?).uniq
    end

    # Replaces the book's author using the first dc:creator element whose
    # opf:role attribute is "aut". That element's text becomes the reading
    # order and its opf:file-as attribute, when present, the sort order.
    # Values not supplied by the OPF are carried over from the book's current
    # author. Does nothing when the document has no dc:creator element.
    #
    # @param opf_doc the parsed OPF document
    def load_author!(opf_doc)
      current = @book.author
      grouping = current.grouping
      reading_order = current.reading_order
      sort_order = current.sort_order

      creators = opf_doc.css("dc|creator", "dc" => DC_NS_URL)
      return if creators.empty?

      # Look through every creator rather than only the first one, so an
      # author listed after e.g. an editor or illustrator is still picked up.
      author_node = creators.find { |creator| creator["opf:role"] == "aut" }
      if author_node
        reading_order = author_node.content
        sort_order = author_node["opf:file-as"] || sort_order
      end

      @book.author = Author.new(grouping, reading_order, sort_order)
    end

    # Builds the cover from the manifest item whose id equals +cover_id+.
    #
    # @param zipfile the opened epub archive
    # @param opf_path [String] name of the OPF entry inside the archive
    # @param opf_doc the parsed OPF document
    # @param cover_id [String, nil] manifest id of the cover image
    # @return a Cover built from the image's input stream, its href and its
    #   media-type attribute; nil when no manifest item has that id or the
    #   image cannot be found in the archive (a warning is printed then)
    def load_cover(zipfile, opf_path, opf_doc, cover_id)
      item = opf_doc.css("package manifest item").find { |i| cover_id == i["id"] }
      return nil if item.nil?

      href = item["href"]
      mime_type = item["media-type"]

      entry = find_cover_entry(zipfile, href, opf_path)
      if entry.nil?
        puts "WARNING! Cover image #{href.inspect} not found in file #{@book.path.inspect}."
        return nil
      end

      # The Cover is constructed while the stream is still open inside the
      # block; returning from within the block hands that Cover to the caller.
      entry.get_input_stream do |is|
        return Cover.new(is, href, mime_type)
      end

      nil
    end

    # Returns the text of the first Dublin Core element with the given local
    # name, or nil when the document has no such element.
    #
    # @param opf_doc the parsed OPF document
    # @param element [String] local element name, e.g. "title"
    # @return [String, nil]
    def first_dc_content(opf_doc, element)
      node = opf_doc.at_css("dc|#{element}", "dc" => DC_NS_URL)
      node&.content
    end

    # Copies the first dc:description, if any, to the book's description.
    def load_description!(opf_doc)
      description = first_dc_content(opf_doc, "description")
      @book.description = description unless description.nil?
    end

    # Copies the first dc:language, if any, to the book's language.
    def load_language!(opf_doc)
      language = first_dc_content(opf_doc, "language")
      @book.language = language unless language.nil?
    end

    # Reads the name/content meta elements under package > metadata.
    # Other metadata: series, volume, cover.
    #
    # @param opf_doc the parsed OPF document
    def load_other_metadata!(opf_doc)
      opf_doc.css("package metadata meta").each do |meta|
        content = meta["content"]

        case meta["name"]
        when "calibre:series"
          # TODO: Dynamically create a new series?
          # @series_id = content
        when "calibre:series-index"
          @book.volume = content
        when "cover"
          # Keep the default id when the meta element carries no usable
          # content, rather than overwriting it with nil or "".
          @cover_id = content unless content.nil? || content.empty?
        end
      end
    end

    # Copies the first dc:title, if any, to the book's title.
    def load_title!(opf_doc)
      title = first_dc_content(opf_doc, "title")
      @book.title = title unless title.nil?
    end
  end
end