]> jaekl.net Git - quanlib.git/commitdiff
(Add scanner.opf which was forgotten in earlier commit)
author Chris Jaekl <chris@jaekl.net>
Thu, 27 Jun 2024 02:56:10 +0000 (22:56 -0400)
committer Chris Jaekl <chris@jaekl.net>
Thu, 27 Jun 2024 02:56:10 +0000 (22:56 -0400)
app/scanner/opf.rb [new file with mode: 0644]

diff --git a/app/scanner/opf.rb b/app/scanner/opf.rb
new file mode 100644 (file)
index 0000000..d70fa8b
--- /dev/null
@@ -0,0 +1,151 @@
+# frozen_string_literal: true
+
+require "nokogiri"
+require "zip"
+
+require "scanner/scanner"
+
+module Scanner
+  # Scans an .epub file for information about the book it represents
+  class Opf
+    def initialize(book)
+      @book = book
+    end
+
+    def scan_opf(zipfile, opf_path)
+      @cover_id = "cover-image"
+
+      opf_xml = zipfile.read(opf_path)
+      opf_doc = Nokogiri::XML(opf_xml)
+
+      load_author!(opf_doc)
+      load_title!(opf_doc)
+      load_description!(opf_doc)
+      load_language!(opf_doc)
+      load_other_metadata!(opf_doc)
+      @book.cover = load_cover(zipfile, opf_path, opf_doc, @cover_id)
+
+      @book
+    end
+
+    private
+
+    DC_NS_URL = "http://purl.org/dc/elements/1.1/"
+    private_constant :DC_NS_URL
+
+    def find_cover_entry(zipfile, href, opf_path)
+      entry = zipfile.find_entry(href)
+
+      if entry.nil?
+        # Although the epub standard requires the path to be relative
+        # to the base of the epub (zip), some books encountered in the
+        # wild have been found to use a path relative to the location
+        # of the opf file.
+        parts = opf_path.split("/")
+        opf_base_path = parts[0..-2].join("/")
+        cover_path = "#{opf_base_path}/#{href}"
+        entry = zipfile.find_entry(cover_path)
+      end
+
+      if !entry && href.start_with?("../")
+        # Another case found in the wild:  cover image is at the root, but path is '../cover.jpeg'
+        cover_path = href[3..]
+        entry = zipfile.find_entry(cover_path)
+      end
+
+      entry
+    end
+
+    def load_author!(opf_doc)
+      grouping = @book.author.grouping
+      reading_order = @book.author.reading_order
+      sort_order = @book.author.sort_order
+
+      creators = opf_doc.css("dc|creator", "dc" => DC_NS_URL)
+
+      return if creators.empty?
+
+      creator = creators[0]
+
+      return if creator.nil?
+
+      role = creator["opf:role"]
+      if role == "aut"
+        reading_order = creator.content
+
+        file_as = creator["opf:file-as"]
+        sort_order = file_as unless file_as.nil?
+      end
+
+      @book.author = Author.new(grouping, reading_order, sort_order)
+    end
+
+    def load_cover(zipfile, opf_path, opf_doc, cover_id)
+      items = opf_doc.css("package manifest item")
+      items.each do |i|
+        href = i["href"]
+        id = i["id"]
+        mime_type = i["media-type"]
+
+        next unless cover_id == id
+
+        entry = find_cover_entry(zipfile, href, opf_path)
+
+        if entry.nil?
+          puts "WARNING!  Cover image #{href.inspect} not found in file #{@book.path.inspect}."
+          return nil
+        else
+          entry.get_input_stream do |is|
+            return Cover.new(is, href, mime_type)
+          end
+        end
+      end
+
+      nil
+    end
+
+    def load_description!(opf_doc)
+      descr_nodes = opf_doc.css("dc|description", "dc" => DC_NS_URL)
+      return if descr_nodes.empty?
+
+      descr_node = descr_nodes[0]
+      @book.description = descr_node.content unless descr_node.nil?
+    end
+
+    def load_language!(opf_doc)
+      lang_nodes = opf_doc.css("dc|language", "dc" => DC_NS_URL)
+      return if lang_nodes.empty?
+
+      lang_node = lang_nodes[0]
+      @book.language = lang_node.content if lang_node
+    end
+
+    def load_other_metadata!(opf_doc)
+      # Other metadata:  series, volume, cover
+
+      metas = opf_doc.css("package metadata meta")
+      metas.each do |m|
+        name = m["name"]
+        content = m["content"]
+
+        case name
+        when "calibre:series"
+          # TODO:  Dynamically create a new series?
+          # @series_id = content
+        when "calibre:series-index"
+          @book.volume = content
+        when "cover"
+          @cover_id = content
+        end
+      end
+    end
+
+    def load_title!(opf_doc)
+      titles = opf_doc.css("dc|title", "dc" => DC_NS_URL)
+      return if titles.empty?
+
+      title = titles[0]
+      @book.title = title.content unless title.nil?
+    end
+  end
+end