From: Chris Jaekl <chris@jaekl.net>
Date: Mon, 24 Jun 2024 18:42:13 +0000 (-0400)
Subject: Introduces a new class Scanner and subclasses Epub and Pdf.
X-Git-Url: http://jaekl.net/gitweb/?a=commitdiff_plain;h=b00ba7ecee670d7fb05a42b580acaf546d024915;p=quanlib.git

Introduces a new class Scanner and subclasses Epub and Pdf.

This enables us to split out some of the (overly-large) class Book
into separate files, moving us toward more, smaller, classes.
---

diff --git a/app/book.rb b/app/book.rb
index 3449b6d..417f7a2 100644
--- a/app/book.rb
+++ b/app/book.rb
@@ -11,9 +11,6 @@ require_relative "store"
 
 # Encapsulates info about a book in the library
 class Book
-  @@dc_ns_url = "http://purl.org/dc/elements/1.1/"
-  @@series_and_volume_regex = /^([A-Z]+)([0-9]+(\.[0-9]+)?)$/
-
   attr_accessor(
     :arrived,
     :author,
@@ -31,11 +28,6 @@ class Book
     @store = store
   end
 
-  def load_from_file!(file_name)
-    @path = file_name
-    parse_file_name!(file_name)
-  end
-
   def self.can_handle?(file_name)
     return false if file_name.nil?
 
@@ -124,236 +116,4 @@ class Book
 
     reading_order
   end
-
-  # Returns (series, volumeNo, titleText)
-  def process_title(input)
-    return if input.nil?
-
-    arr = input.split("_")
-
-    series = nil
-    vol = nil
-
-    first = arr[0]
-    match_data = first.match(@@series_and_volume_regex)
-    unless match_data.nil?
-      capt = match_data.captures
-      series = capt[0]
-      vol = capt[1]
-      arr.shift
-    end
-
-    pos = arr[-1].rindex(".")
-    arr[-1] = arr[-1].slice(0, pos) unless pos.nil?
-
-    title = arr.join(" ")
-
-    bare_title_grouping =
-      title_grouping
-      .split("_")
-      .reject { |part| part.match(@@series_and_volume_regex) }
-      .join("_")
-
-    unless bare_title_grouping == Book.grouping_for_title(title)
-      discrepancy = "#{bare_title_grouping.inspect} vs. #{Book.grouping_for_title(title).inspect}"
-      puts "WARNING:  title_grouping mismatch:  #{discrepancy}"
-    end
-
-    [series, vol, title]
-  end
-
-  def parse_file_name!(file_name)
-    category = nil # e.g., non-fiction, fan-fiction
-    grouping = ""
-
-    parts = file_name.split("/")
-    (series_code, @volume, @title) = process_title(parts[-1])
-    if parts.length > 1
-      grouping = parts[-2]
-      reading_order = massage_author(grouping)
-      sort_order = nil
-      @author = Author.new(grouping, reading_order, sort_order)
-      @series_id = @store.get_series(grouping, series_code)
-    end
-    category = parts[-3] if parts.length > 2
-
-    lc_file_name = file_name.downcase
-    if lc_file_name.end_with?(".epub")
-      scan_epub!(file_name)
-    elsif lc_file_name.end_with?(".pdf")
-      scan_pdf!(file_name)
-    end
-
-    @arrived = File.ctime(file_name)
-
-    @classification_id = @store.find_classification(@author.grouping, File.basename(file_name, ".*"))
-
-    # TODO:  Fix horrible hard-coded strings and paths
-    return unless category == "00_nonFic" && classification_id.nil?
-
-    File.open(Store.unclassified_csv, "a") do |fd|
-      fd.puts "#{grouping.inspect},#{path.inspect}"
-    end
-  end
-
-  def scan_epub!(file_name)
-    Zip.warn_invalid_date = false
-    Zip::File.open(file_name) do |zipfile|
-      entry = zipfile.find_entry("META-INF/container.xml")
-      if entry.nil?
-        puts "No META-INF/container.xml, skipping book #{file_name.inspect}"
-        return nil
-      end
-      cont_xml = zipfile.read("META-INF/container.xml")
-      cont_doc = Nokogiri::XML(cont_xml)
-      opf_path = cont_doc.css("container rootfiles rootfile")[0]["full-path"]
-
-      scan_opf!(zipfile, opf_path)
-    rescue Zip::Error => e
-      puts "ERROR processing file #{file_name.inspect}:"
-      puts e.message
-      puts e.backtrace
-    end
-  end
-
-  def scan_pdf!(file_name)
-    pdf_path = File.expand_path(file_name).to_s
-    unless pdf_path.end_with?(".pdf")
-      puts "Unexpected internal error:  path #{file_name.inspect} does not end with \".pdf\"."
-      return
-    end
-
-    jpeg_path = "#{pdf_path[0..-5]}.jpeg"
-
-    return unless File.file?(jpeg_path)
-
-    File.open(jpeg_path, "r") do |is|
-      @cover = Cover.new(is, jpeg_path, "image/jpeg")
-    end
-  end
-
-  def scan_opf!(zipfile, opf_path)
-    cover_id = nil
-
-    opf_xml = zipfile.read(opf_path)
-    opf_doc = Nokogiri::XML(opf_xml)
-
-    #-------
-    # Author
-
-    grouping = @author.grouping
-    reading_order = @author.reading_order
-    sort_order = @author.sort_order
-
-    creators = opf_doc.css("dc|creator", "dc" => @@dc_ns_url)
-    unless creators.empty?
-      creator = creators[0]
-
-      return if creator.nil?
-
-      role = creator["opf:role"]
-      if role == "aut"
-        reading_order = creator.content
-
-        file_as = creator["opf:file-as"]
-        sort_order = file_as unless file_as.nil?
-      end
-
-      @author = Author.new(grouping, reading_order, sort_order)
-    end
-
-    #---------------------------------------
-    # Title
-
-    titles = opf_doc.css("dc|title", "dc" => @@dc_ns_url)
-    unless titles.empty?
-      title = titles[0]
-      @title = title.content unless title.nil?
-    end
-
-    #---------------------------------------
-    # Description
-
-    descr_nodes = opf_doc.css("dc|description", "dc" => @@dc_ns_url)
-    unless descr_nodes.empty?
-      descr_node = descr_nodes[0]
-      @description = descr_node.content unless descr_node.nil?
-    end
-
-    #---------------------------------------
-    # Language
-
-    lang_nodes = opf_doc.css("dc|language", "dc" => @@dc_ns_url)
-    unless lang_nodes.empty?
-      lang_node = lang_nodes[0]
-      @language = lang_node.content if lang_node
-    end
-
-    #---------------------------------------
-    # Other metadata:  series, volume, cover
-
-    metas = opf_doc.css("package metadata meta")
-    metas.each do |m|
-      name = m["name"]
-      content = m["content"]
-
-      case name
-      when "calibre:series"
-        # TODO:  Dynamically create a new series?
-        # @series_id = content
-      when "calibre:series-index"
-        @volume = content
-      when "cover"
-        cover_id = content
-      end
-    end
-
-    #---------------
-    # Load the cover
-
-    @cover = load_cover(zipfile, opf_path, opf_doc, cover_id)
-  end
-
-  def load_cover(zipfile, opf_path, opf_doc, cover_id)
-    cover_id = "cover-image" if cover_id.nil?
-
-    items = opf_doc.css("package manifest item")
-    items.each do |i|
-      href = i["href"]
-      id = i["id"]
-      mime_type = i["media-type"]
-
-      next unless cover_id == id
-
-      entry = zipfile.find_entry(href)
-
-      if entry.nil?
-        # Although the epub standard requires the path to be relative
-        # to the base of the epub (zip), some books encountered in the
-        # wild have been found to use a bath relative to the location
-        # of the opf file.
-        parts = opf_path.split("/")
-        opf_base_path = parts[0..-2].join("/")
-        cover_path = "#{opf_base_path}/#{href}"
-        entry = zipfile.find_entry(cover_path)
-      end
-
-      if !entry && href.start_with?("../")
-        # Another case found in the wild:  cover image is at the root, but path is '../cover.jpeg'
-        cover_path = href[3..]
-        entry = zipfile.find_entry(cover_path)
-      end
-
-      if entry.nil?
-        puts "WARNING!  Cover image #{href.inspect} not found in file #{@path.inspect}."
-        return nil
-      else
-        entry.get_input_stream do |is|
-          return Cover.new(is, href, mime_type)
-        end
-      end
-    end
-
-    nil
-  end
 end
diff --git a/app/book_loader.rb b/app/book_loader.rb
index ca963ff..41ae835 100644
--- a/app/book_loader.rb
+++ b/app/book_loader.rb
@@ -3,25 +3,46 @@
 require_relative "book"
 require_relative "store"
 
+require "scanner/epub"
+require "scanner/pdf"
+require "scanner/scanner"
+
 # Worker thread that pulls filenames from a queue and loads them as new books
 class BookLoader
   DONE_MARKER = "<END>"
 
-  def initialize(config_file, queue)
+  def initialize(config_file, queue, store)
     @config_file = config_file
     @queue = queue
+    @store = store
+
+    @scanners = [
+      Scanner::Epub.new(@store),
+      Scanner::Pdf.new(@store),
+    ]
+  end
+
+  def handles?(filename)
+    @scanners.any? do |scanner|
+      scanner.handles?(filename)
+    end
+  end
+
+  def load_file(filename)
+    @scanners.find do |scanner|
+      scanner.handles?(filename)
+    end&.scan_file(filename)
   end
 
   def run
-    @store = Store.new(@config_file)
     @store.connect
 
     file = @queue.pop
 
     until file == DONE_MARKER
-      book = Book.new(@store)
-      book.load_from_file!(file)
-      @store.store_book(book)
+      book = load_file(file)
+
+      @store.store_book(book) unless book.nil?
 
       file = @queue.pop
     end
diff --git a/app/scanner/epub.rb b/app/scanner/epub.rb
new file mode 100644
index 0000000..c4a6fb0
--- /dev/null
+++ b/app/scanner/epub.rb
@@ -0,0 +1,180 @@
+# frozen_string_literal: true
+
+require "nokogiri"
+require "zip"
+
+require "scanner/scanner"
+
+module Scanner
+  # Scans an .epub file for information about the book it represents.
+  #
+  # An .epub is a zip archive:  META-INF/container.xml names the OPF package
+  # document, which in turn supplies the author, title, description,
+  # language, series index, and cover image read here.
+  class Epub < Scanner
+    # Returns true when +filename+ ends with ".epub" (case-insensitive),
+    # false for any other name, and nil when +filename+ is nil.
+    def handles?(filename)
+      filename&.downcase&.end_with?(".epub")
+    end
+
+    # Builds a Book from the file's path, then overlays the metadata found in
+    # the OPF package document.
+    #
+    # Returns the Book, or nil when META-INF/container.xml is missing or lists
+    # no rootfile, or when a Zip::Error is raised while reading the archive.
+    def scan_file(filename)
+      Zip.warn_invalid_date = false
+      Zip::File.open(filename) do |zipfile|
+        entry = zipfile.find_entry("META-INF/container.xml")
+        if entry.nil?
+          puts "No META-INF/container.xml, skipping book #{filename.inspect}"
+          return nil
+        end
+        cont_xml = zipfile.read("META-INF/container.xml")
+        cont_doc = Nokogiri::XML(cont_xml)
+
+        # Skip the book, rather than fail on nil, when no rootfile is listed.
+        rootfile = cont_doc.css("container rootfiles rootfile").first
+        if rootfile.nil?
+          puts "No rootfile in META-INF/container.xml, skipping book #{filename.inspect}"
+          return nil
+        end
+
+        book = scan_base_attributes(filename)
+        scan_opf(book, zipfile, rootfile["full-path"])
+      rescue Zip::Error => e
+        puts "ERROR processing file #{filename.inspect}:"
+        puts e.message
+        puts e.backtrace
+        nil
+      end
+    end
+
+    private
+
+    DC_NS_URL = "http://purl.org/dc/elements/1.1/"
+    private_constant :DC_NS_URL
+
+    # Overlays metadata from the OPF package document stored at +opf_path+
+    # inside +zipfile+ onto +book+, and returns +book+.
+    def scan_opf(book, zipfile, opf_path)
+      cover_id = nil
+
+      opf_xml = zipfile.read(opf_path)
+      opf_doc = Nokogiri::XML(opf_xml)
+
+      #-------
+      # Author
+
+      grouping = book.author.grouping
+      reading_order = book.author.reading_order
+      sort_order = book.author.sort_order
+
+      creator = opf_doc.css("dc|creator", "dc" => DC_NS_URL).first
+      unless creator.nil?
+        # Only a creator tagged with the "aut" role overrides the names that
+        # were derived from the file path.
+        if creator["opf:role"] == "aut"
+          reading_order = creator.content
+
+          file_as = creator["opf:file-as"]
+          sort_order = file_as unless file_as.nil?
+        end
+
+        book.author = Author.new(grouping, reading_order, sort_order)
+      end
+
+      #---------------------------------------
+      # Title
+
+      title = opf_doc.css("dc|title", "dc" => DC_NS_URL).first
+      book.title = title.content unless title.nil?
+
+      #---------------------------------------
+      # Description
+
+      descr_node = opf_doc.css("dc|description", "dc" => DC_NS_URL).first
+      book.description = descr_node.content unless descr_node.nil?
+
+      #---------------------------------------
+      # Language
+
+      lang_node = opf_doc.css("dc|language", "dc" => DC_NS_URL).first
+      book.language = lang_node.content unless lang_node.nil?
+
+      #---------------------------------------
+      # Other metadata:  series, volume, cover
+
+      metas = opf_doc.css("package metadata meta")
+      metas.each do |m|
+        name = m["name"]
+        content = m["content"]
+
+        case name
+        when "calibre:series"
+          # TODO:  Dynamically create a new series?
+          # @series_id = content
+        when "calibre:series-index"
+          book.volume = content
+        when "cover"
+          cover_id = content
+        end
+      end
+
+      #---------------
+      # Load the cover
+
+      book.cover = load_cover(zipfile, opf_path, opf_doc, cover_id)
+
+      book
+    end
+
+    # Finds the manifest item whose id is +cover_id+ (or "cover-image" when
+    # +cover_id+ is nil) and returns it as a Cover.  Returns nil when there is
+    # no such item or its image cannot be found in +zipfile+.
+    def load_cover(zipfile, opf_path, opf_doc, cover_id)
+      cover_id = "cover-image" if cover_id.nil?
+
+      items = opf_doc.css("package manifest item")
+      items.each do |i|
+        href = i["href"]
+        id = i["id"]
+        mime_type = i["media-type"]
+
+        next unless cover_id == id
+
+        entry = zipfile.find_entry(href)
+
+        if entry.nil?
+          # Although the epub standard requires the path to be relative
+          # to the base of the epub (zip), some books encountered in the
+          # wild have been found to use a path relative to the location
+          # of the opf file.
+          parts = opf_path.split("/")
+          opf_base_path = parts[0..-2].join("/")
+          cover_path = "#{opf_base_path}/#{href}"
+          entry = zipfile.find_entry(cover_path)
+        end
+
+        if !entry && href.start_with?("../")
+          # Another case found in the wild:  cover image is at the root, but path is '../cover.jpeg'
+          cover_path = href[3..]
+          entry = zipfile.find_entry(cover_path)
+        end
+
+        if entry.nil?
+          # This scanner has no @book; report the archive's own file name.
+          puts "WARNING!  Cover image #{href.inspect} not found in file #{zipfile.name.inspect}."
+          return nil
+        else
+          entry.get_input_stream do |is|
+            return Cover.new(is, href, mime_type)
+          end
+        end
+      end
+
+      nil
+    end
+  end
+end
diff --git a/app/scanner/pdf.rb b/app/scanner/pdf.rb
new file mode 100644
index 0000000..8c4a98f
--- /dev/null
+++ b/app/scanner/pdf.rb
@@ -0,0 +1,42 @@
+# frozen_string_literal: true
+
+require "scanner/scanner"
+
+module Scanner
+  # Scans for information about a .pdf file.
+  #
+  # Nothing is read from the PDF itself:  the book's attributes come from its
+  # path, and its cover from an optional ".jpeg" file alongside it that
+  # shares the PDF's base name.
+  class Pdf < Scanner
+    # Returns true when +filename+ ends with ".pdf" (case-insensitive),
+    # false for any other name, and nil when +filename+ is nil.
+    def handles?(filename)
+      filename&.downcase&.end_with?(".pdf")
+    end
+
+    # Builds a Book from the file's path and attaches the sibling cover image
+    # when one exists.  Returns the Book whether or not a cover was found, so
+    # that a PDF with no cover image is still loaded.
+    def scan_file(filename)
+      pdf_path = File.expand_path(filename).to_s
+      # Case-insensitive, to agree with handles?
+      unless pdf_path.downcase.end_with?(".pdf")
+        puts "Unexpected internal error:  path #{filename.inspect} does not end with \".pdf\"."
+        return
+      end
+
+      book = scan_base_attributes(filename)
+
+      jpeg_path = "#{pdf_path[0..-5]}.jpeg"
+
+      if File.file?(jpeg_path)
+        File.open(jpeg_path, "r") do |is|
+          book.cover = Cover.new(is, jpeg_path, "image/jpeg")
+        end
+      end
+
+      book
+    end
+  end
+end
diff --git a/app/scanner/scanner.rb b/app/scanner/scanner.rb
new file mode 100644
index 0000000..bbeab1f
--- /dev/null
+++ b/app/scanner/scanner.rb
@@ -0,0 +1,153 @@
+# frozen_string_literal: true
+
+require "book"
+
+module Scanner
+  # Abstract base class for scanners that know how to gather information about an e-book file
+  class Scanner
+    # +store+ is passed to every Book created here and is queried for the
+    # book's series and classification.
+    def initialize(store)
+      @store = store
+    end
+
+    # Subclasses return a Book describing the file, or nil to skip it.
+    def scan_file(_filename)
+      raise "Not implemented (abstract base class)"
+    end
+
+    # Subclasses return a truthy value when they can scan the named file.
+    def handles?(_filename)
+      raise "Not implemented (abstract base class)"
+    end
+
+    private
+
+    NON_FIC_FOLDER = "00_nonFic"
+    private_constant :NON_FIC_FOLDER
+
+    # Matches a file-name part such as "LW01" or "AB2.5":  an upper-case
+    # series code followed by a volume number.
+    SERIES_AND_VOLUME_REGEX = /^([A-Z]+)([0-9]+(\.[0-9]+)?)$/
+    private_constant :SERIES_AND_VOLUME_REGEX
+
+    # Splits a file name such as "LW01_Little_Women.epub" into its parts.
+    #
+    # Returns [series, volumeNo, titleText], e.g. ["LW", "01", "Little Women"];
+    # series and volumeNo are nil when the name has no series prefix.
+    # Returns nil when +input+ is nil.
+    def process_title(input)
+      return if input.nil?
+
+      arr = input.split("_")
+
+      series = nil
+      vol = nil
+
+      first = arr[0]
+      match_data = first.match(SERIES_AND_VOLUME_REGEX)
+      unless match_data.nil?
+        capt = match_data.captures
+        series = capt[0]
+        vol = capt[1]
+        arr.shift
+      end
+
+      pos = arr[-1].rindex(".")
+      arr[-1] = arr[-1].slice(0, pos) unless pos.nil?
+
+      title = arr.join(" ")
+
+      [series, vol, title]
+    end
+
+    # Warns on stdout when the grouping taken from the file's base name (less
+    # any series/volume part) differs from Book.grouping_for_title applied to
+    # the title that process_title derives from the same file name.
+    def check_title_grouping(path)
+      return if path.nil?
+
+      _series, _vol, title = process_title(File.basename(path))
+      expected = Book.grouping_for_title(title)
+
+      bare_title_grouping =
+        title_grouping(path)
+        .split("_")
+        .reject { |part| part.match(SERIES_AND_VOLUME_REGEX) }
+        .join("_")
+
+      return if bare_title_grouping == expected
+
+      discrepancy = "#{bare_title_grouping.inspect} vs. #{expected.inspect}"
+      puts "WARNING:  title_grouping mismatch:  #{discrepancy}"
+    end
+
+    # Turns an author grouping into a readable name by inserting a space
+    # before each upper-case character after the first, e.g.
+    # "LouisaAlcott" becomes "Louisa Alcott".  Returns nil for nil input.
+    def massage_author(input)
+      return if input.nil?
+
+      reading_order = ""
+      input.each_char do |c|
+        reading_order += " " if upper?(c) && !reading_order.empty?
+        reading_order += c
+      end
+
+      reading_order
+    end
+
+    # Creates a Book for +filename+ holding everything that can be derived
+    # from the path alone, which is read as <category>/<author>/<file>.
+    #
+    # Also appends a line to Store.unclassified_csv when the book sits in the
+    # non-fiction folder but the store has no classification for it.
+    #
+    # NOTE(review): a +filename+ with no directory part never sets
+    # book.author here, so the find_classification call below may raise;
+    # confirm that callers always pass a path that includes the author's folder.
+    def scan_base_attributes(filename)
+      book = Book.new(@store)
+
+      book.path = filename
+
+      category = nil # e.g., non-fiction, fan-fiction
+      grouping = ""
+
+      parts = filename.split("/")
+      (series_code, book.volume, book.title) = process_title(parts[-1])
+      if parts.length > 1
+        grouping = parts[-2]
+        reading_order = massage_author(grouping)
+        sort_order = nil
+        book.author = Author.new(grouping, reading_order, sort_order)
+        book.series_id = @store.get_series(grouping, series_code)
+      end
+      category = parts[-3] if parts.length > 2
+
+      book.arrived = File.ctime(filename)
+      book.classification_id = @store.find_classification(book.author.grouping, File.basename(filename, ".*"))
+
+      # Both values live on the book; this class has no such methods itself.
+      return book unless category == NON_FIC_FOLDER && book.classification_id.nil?
+
+      File.open(Store.unclassified_csv, "a") do |fd|
+        fd.puts "#{grouping.inspect},#{book.path.inspect}"
+      end
+
+      book
+    end
+
+    # Returns the base name of +path+ without its extension, or nil for nil.
+    def title_grouping(path)
+      return if path.nil?
+
+      File.basename(path, ".*")
+    end
+
+    # Truthy when +character+ contains an upper-case letter.
+    def upper?(character)
+      /[[:upper:]]/.match(character)
+    end
+  end
+end
diff --git a/test/book_loader_test.rb b/test/book_loader_test.rb
new file mode 100644
index 0000000..fd0adb5
--- /dev/null
+++ b/test/book_loader_test.rb
@@ -0,0 +1,78 @@
+# frozen_string_literal: true
+
+require "test_helper"
+
+require "book"
+require "book_loader"
+require "store_mock"
+
+# Exercises BookLoader's scanner dispatch:  which file types it accepts, and
+# what it reads from the sample Little Women epub.
+class BookLoaderTest < Minitest::Test
+  def setup
+    @queue = Queue.new
+    @store = StoreMock.new
+    @book_loader = BookLoader.new("/path/to/config.ini", @queue, @store)
+  end
+
+  def test_that_it_can_handle_epub_and_pdf_files
+    %w[epub pdf].each do |extension|
+      assert_equal true, @book_loader.handles?("sample.#{extension}")
+    end
+  end
+
+  def test_that_it_cannot_handle_mobi_html_txt_doc_zip_rtf_nor_rar
+    %w[doc html mobi rar rtf txt zip].each do |extension|
+      assert_equal false, @book_loader.handles?("sample.#{extension}")
+    end
+  end
+
+  def test_load_from_file
+    @store.expects(:get_series).returns(mock_series_lw)
+    @store.connect
+
+    book = @book_loader.load_file(File.join(TestHelper::SAMPLE_DATA_PATH, "LouisaAlcott", "LW01_Little_Women.epub"))
+
+    author = book.author
+
+    assert_equal "LouisaAlcott", author.grouping
+    assert_equal "Louisa May Alcott", author.reading_order
+    assert_equal "Alcott, Louisa May", author.sort_order
+
+    expected_descr = "This story follows the lives of the four March sisters&emdash;Meg, Jo, Beth, and Amy&emdash;" \
+      "and details their coming of age."
+
+    assert_equal expected_descr, book.description
+    assert_equal "en", book.language
+    assert_equal "Little Women: Or, Meg, Jo, Beth and Amy", book.title
+    assert_equal mock_series_lw.to_s, book.series_id.to_s
+    assert_equal 1, book.volume.to_i
+  end
+
+  def test_heading
+    @store.expects(:get_series).returns(mock_series_lw)
+    @store.connect
+
+    book = @book_loader.load_file(File.join(TestHelper::SAMPLE_DATA_PATH, "LouisaAlcott", "LW01_Little_Women.epub"))
+
+    expected = "<b>Little Women: Or, Meg, Jo, Beth and Amy</b><br/><i>by Louisa May Alcott</i><br/>01"
+    actual = book.heading
+
+    assert_equal expected, actual
+  end
+
+  private
+
+  # A stand-in for the series record the store would return for "LW".
+  def mock_series_lw
+    id = 1
+    series = Series.new(id)
+    series.age = "ya"
+    series.genre = "romance"
+    series.grouping = "LouisaAlcott"
+    series.code = "LW"
+    series.descr = "Little Women"
+
+    series
+  end
+end
diff --git a/test/book_test.rb b/test/book_test.rb
index 034ae04..a15cc84 100644
--- a/test/book_test.rb
+++ b/test/book_test.rb
@@ -31,44 +31,6 @@ class BookTest < Minitest::Test
     end
   end
 
-  def test_load_from_file
-    store = StoreMock.new
-    store.expects(:get_series).returns(mock_series_lw)
-    store.connect
-    book = Book.new(store)
-
-    book.load_from_file!(File.join(TestHelper::SAMPLE_DATA_PATH, "LouisaAlcott", "LW01_Little_Women.epub"))
-
-    author = book.author
-
-    assert_equal "LouisaAlcott", author.grouping
-    assert_equal "Louisa May Alcott", author.reading_order
-    assert_equal "Alcott, Louisa May", author.sort_order
-
-    expected_descr = "This story follows the lives of the four March sisters&emdash;Meg, Jo, Beth, and Amy&emdash;" \
-      "and details their coming of age."
-
-    assert_equal expected_descr, book.description
-    assert_equal "en", book.language
-    assert_equal "Little Women: Or, Meg, Jo, Beth and Amy", book.title
-    assert_equal mock_series_lw.to_s, book.series_id.to_s
-    assert_equal 1, book.volume.to_i
-  end
-
-  def test_heading
-    store = StoreMock.new
-    store.expects(:get_series).returns(mock_series_lw)
-    store.connect
-    book = Book.new(store)
-
-    book.load_from_file!(File.join(TestHelper::SAMPLE_DATA_PATH, "LouisaAlcott", "LW01_Little_Women.epub"))
-
-    expected = "<b>Little Women: Or, Meg, Jo, Beth and Amy</b><br/><i>by Louisa May Alcott</i><br/>01"
-    actual = book.heading
-
-    assert_equal expected, actual
-  end
-
   private
 
   def mock_series_lw