From: Chris Jaekl Date: Mon, 24 Jun 2024 18:42:13 +0000 (-0400) Subject: Introduces a new class Scanner and subclasses Epub and Pdf. X-Git-Url: http://jaekl.net/gitweb/?a=commitdiff_plain;h=b00ba7ecee670d7fb05a42b580acaf546d024915;p=quanlib.git Introduces a new class Scanner and subclasses Epub and Pdf. This enables us to split out some of the (overly-large) class Book into separate files, moving us toward more, smaller, classes. --- diff --git a/app/book.rb b/app/book.rb index 3449b6d..417f7a2 100644 --- a/app/book.rb +++ b/app/book.rb @@ -11,9 +11,6 @@ require_relative "store" # Encapsulates info about a book in the library class Book - @@dc_ns_url = "http://purl.org/dc/elements/1.1/" - @@series_and_volume_regex = /^([A-Z]+)([0-9]+(\.[0-9]+)?)$/ - attr_accessor( :arrived, :author, @@ -31,11 +28,6 @@ class Book @store = store end - def load_from_file!(file_name) - @path = file_name - parse_file_name!(file_name) - end - def self.can_handle?(file_name) return false if file_name.nil? @@ -124,236 +116,4 @@ class Book reading_order end - - # Returns (series, volumeNo, titleText) - def process_title(input) - return if input.nil? - - arr = input.split("_") - - series = nil - vol = nil - - first = arr[0] - match_data = first.match(@@series_and_volume_regex) - unless match_data.nil? - capt = match_data.captures - series = capt[0] - vol = capt[1] - arr.shift - end - - pos = arr[-1].rindex(".") - arr[-1] = arr[-1].slice(0, pos) unless pos.nil? - - title = arr.join(" ") - - bare_title_grouping = - title_grouping - .split("_") - .reject { |part| part.match(@@series_and_volume_regex) } - .join("_") - - unless bare_title_grouping == Book.grouping_for_title(title) - discrepancy = "#{bare_title_grouping.inspect} vs. 
#{Book.grouping_for_title(title).inspect}" - puts "WARNING: title_grouping mismatch: #{discrepancy}" - end - - [series, vol, title] - end - - def parse_file_name!(file_name) - category = nil # e.g., non-fiction, fan-fiction - grouping = "" - - parts = file_name.split("/") - (series_code, @volume, @title) = process_title(parts[-1]) - if parts.length > 1 - grouping = parts[-2] - reading_order = massage_author(grouping) - sort_order = nil - @author = Author.new(grouping, reading_order, sort_order) - @series_id = @store.get_series(grouping, series_code) - end - category = parts[-3] if parts.length > 2 - - lc_file_name = file_name.downcase - if lc_file_name.end_with?(".epub") - scan_epub!(file_name) - elsif lc_file_name.end_with?(".pdf") - scan_pdf!(file_name) - end - - @arrived = File.ctime(file_name) - - @classification_id = @store.find_classification(@author.grouping, File.basename(file_name, ".*")) - - # TODO: Fix horrible hard-coded strings and paths - return unless category == "00_nonFic" && classification_id.nil? - - File.open(Store.unclassified_csv, "a") do |fd| - fd.puts "#{grouping.inspect},#{path.inspect}" - end - end - - def scan_epub!(file_name) - Zip.warn_invalid_date = false - Zip::File.open(file_name) do |zipfile| - entry = zipfile.find_entry("META-INF/container.xml") - if entry.nil? - puts "No META-INF/container.xml, skipping book #{file_name.inspect}" - return nil - end - cont_xml = zipfile.read("META-INF/container.xml") - cont_doc = Nokogiri::XML(cont_xml) - opf_path = cont_doc.css("container rootfiles rootfile")[0]["full-path"] - - scan_opf!(zipfile, opf_path) - rescue Zip::Error => e - puts "ERROR processing file #{file_name.inspect}:" - puts e.message - puts e.backtrace - end - end - - def scan_pdf!(file_name) - pdf_path = File.expand_path(file_name).to_s - unless pdf_path.end_with?(".pdf") - puts "Unexpected internal error: path #{file_name.inspect} does not end with \".pdf\"." 
- return - end - - jpeg_path = "#{pdf_path[0..-5]}.jpeg" - - return unless File.file?(jpeg_path) - - File.open(jpeg_path, "r") do |is| - @cover = Cover.new(is, jpeg_path, "image/jpeg") - end - end - - def scan_opf!(zipfile, opf_path) - cover_id = nil - - opf_xml = zipfile.read(opf_path) - opf_doc = Nokogiri::XML(opf_xml) - - #------- - # Author - - grouping = @author.grouping - reading_order = @author.reading_order - sort_order = @author.sort_order - - creators = opf_doc.css("dc|creator", "dc" => @@dc_ns_url) - unless creators.empty? - creator = creators[0] - - return if creator.nil? - - role = creator["opf:role"] - if role == "aut" - reading_order = creator.content - - file_as = creator["opf:file-as"] - sort_order = file_as unless file_as.nil? - end - - @author = Author.new(grouping, reading_order, sort_order) - end - - #--------------------------------------- - # Title - - titles = opf_doc.css("dc|title", "dc" => @@dc_ns_url) - unless titles.empty? - title = titles[0] - @title = title.content unless title.nil? - end - - #--------------------------------------- - # Description - - descr_nodes = opf_doc.css("dc|description", "dc" => @@dc_ns_url) - unless descr_nodes.empty? - descr_node = descr_nodes[0] - @description = descr_node.content unless descr_node.nil? - end - - #--------------------------------------- - # Language - - lang_nodes = opf_doc.css("dc|language", "dc" => @@dc_ns_url) - unless lang_nodes.empty? - lang_node = lang_nodes[0] - @language = lang_node.content if lang_node - end - - #--------------------------------------- - # Other metadata: series, volume, cover - - metas = opf_doc.css("package metadata meta") - metas.each do |m| - name = m["name"] - content = m["content"] - - case name - when "calibre:series" - # TODO: Dynamically create a new series? 
- # @series_id = content - when "calibre:series-index" - @volume = content - when "cover" - cover_id = content - end - end - - #--------------- - # Load the cover - - @cover = load_cover(zipfile, opf_path, opf_doc, cover_id) - end - - def load_cover(zipfile, opf_path, opf_doc, cover_id) - cover_id = "cover-image" if cover_id.nil? - - items = opf_doc.css("package manifest item") - items.each do |i| - href = i["href"] - id = i["id"] - mime_type = i["media-type"] - - next unless cover_id == id - - entry = zipfile.find_entry(href) - - if entry.nil? - # Although the epub standard requires the path to be relative - # to the base of the epub (zip), some books encountered in the - # wild have been found to use a bath relative to the location - # of the opf file. - parts = opf_path.split("/") - opf_base_path = parts[0..-2].join("/") - cover_path = "#{opf_base_path}/#{href}" - entry = zipfile.find_entry(cover_path) - end - - if !entry && href.start_with?("../") - # Another case found in the wild: cover image is at the root, but path is '../cover.jpeg' - cover_path = href[3..] - entry = zipfile.find_entry(cover_path) - end - - if entry.nil? - puts "WARNING! Cover image #{href.inspect} not found in file #{@path.inspect}." 
- return nil - else - entry.get_input_stream do |is| - return Cover.new(is, href, mime_type) - end - end - end - - nil - end end diff --git a/app/book_loader.rb b/app/book_loader.rb index ca963ff..41ae835 100644 --- a/app/book_loader.rb +++ b/app/book_loader.rb @@ -3,25 +3,46 @@ require_relative "book" require_relative "store" +require "scanner/epub" +require "scanner/pdf" +require "scanner/scanner" + # Worker thread that pulls filenames from a queue and loads them as new books class BookLoader DONE_MARKER = "" - def initialize(config_file, queue) + def initialize(config_file, queue, store) @config_file = config_file @queue = queue + @store = store + + @scanners = [ + Scanner::Epub.new(@store), + Scanner::Pdf.new(@store), + ] + end + + def handles?(filename) + @scanners.any? do |scanner| + scanner.handles?(filename) + end + end + + def load_file(filename) + @scanners.find do |scanner| + scanner.handles?(filename) + end&.scan_file(filename) end def run - @store = Store.new(@config_file) @store.connect file = @queue.pop until file == DONE_MARKER - book = Book.new(@store) - book.load_from_file!(file) - @store.store_book(book) + book = load_file(file) + + @store.store_book(book) unless book.nil? file = @queue.pop end diff --git a/app/scanner/epub.rb b/app/scanner/epub.rb new file mode 100644 index 0000000..c4a6fb0 --- /dev/null +++ b/app/scanner/epub.rb @@ -0,0 +1,168 @@ +# frozen_string_literal: true + +require "nokogiri" +require "zip" + +require "scanner/scanner" + +module Scanner + # Scans an .epub file for information about the book it represents + class Epub < Scanner + def handles?(filename) + filename&.downcase&.end_with?(".epub") + end + + def scan_file(filename) + Zip.warn_invalid_date = false + Zip::File.open(filename) do |zipfile| + entry = zipfile.find_entry("META-INF/container.xml") + if entry.nil? 
+ puts "No META-INF/container.xml, skipping book #{filename.inspect}" + return nil + end + cont_xml = zipfile.read("META-INF/container.xml") + cont_doc = Nokogiri::XML(cont_xml) + opf_path = cont_doc.css("container rootfiles rootfile")[0]["full-path"] + + book = scan_base_attributes(filename) + scan_opf(book, zipfile, opf_path) + rescue Zip::Error => e + puts "ERROR processing file #{filename.inspect}:" + puts e.message + puts e.backtrace + end + end + + private + + DC_NS_URL = "http://purl.org/dc/elements/1.1/" + private_constant :DC_NS_URL + + def scan_opf(book, zipfile, opf_path) + cover_id = nil + + opf_xml = zipfile.read(opf_path) + opf_doc = Nokogiri::XML(opf_xml) + + #------- + # Author + + grouping = book.author.grouping + reading_order = book.author.reading_order + sort_order = book.author.sort_order + + creators = opf_doc.css("dc|creator", "dc" => DC_NS_URL) + unless creators.empty? + creator = creators[0] + + return if creator.nil? + + role = creator["opf:role"] + if role == "aut" + reading_order = creator.content + + file_as = creator["opf:file-as"] + sort_order = file_as unless file_as.nil? + end + + book.author = Author.new(grouping, reading_order, sort_order) + end + + #--------------------------------------- + # Title + + titles = opf_doc.css("dc|title", "dc" => DC_NS_URL) + unless titles.empty? + title = titles[0] + book.title = title.content unless title.nil? + end + + #--------------------------------------- + # Description + + descr_nodes = opf_doc.css("dc|description", "dc" => DC_NS_URL) + unless descr_nodes.empty? + descr_node = descr_nodes[0] + book.description = descr_node.content unless descr_node.nil? + end + + #--------------------------------------- + # Language + + lang_nodes = opf_doc.css("dc|language", "dc" => DC_NS_URL) + unless lang_nodes.empty? 
+ lang_node = lang_nodes[0] + book.language = lang_node.content if lang_node + end + + #--------------------------------------- + # Other metadata: series, volume, cover + + metas = opf_doc.css("package metadata meta") + metas.each do |m| + name = m["name"] + content = m["content"] + + case name + when "calibre:series" + # TODO: Dynamically create a new series? + # @series_id = content + when "calibre:series-index" + book.volume = content + when "cover" + cover_id = content + end + end + + #--------------- + # Load the cover + + book.cover = load_cover(zipfile, opf_path, opf_doc, cover_id) + + book + end + + def load_cover(zipfile, opf_path, opf_doc, cover_id) + cover_id = "cover-image" if cover_id.nil? + + items = opf_doc.css("package manifest item") + items.each do |i| + href = i["href"] + id = i["id"] + mime_type = i["media-type"] + + next unless cover_id == id + + entry = zipfile.find_entry(href) + + if entry.nil? + # Although the epub standard requires the path to be relative + # to the base of the epub (zip), some books encountered in the + # wild have been found to use a path relative to the location + # of the opf file. + parts = opf_path.split("/") + opf_base_path = parts[0..-2].join("/") + cover_path = "#{opf_base_path}/#{href}" + entry = zipfile.find_entry(cover_path) + end + + if !entry && href.start_with?("../") + # Another case found in the wild: cover image is at the root, but path is '../cover.jpeg' + cover_path = href[3..] + entry = zipfile.find_entry(cover_path) + end + + if entry.nil? + puts "WARNING! Cover image #{href.inspect} not found in file #{@book.path.inspect}."
+ return nil + else + entry.get_input_stream do |is| + return Cover.new(is, href, mime_type) + end + end + end + + nil + end + end +end diff --git a/app/scanner/pdf.rb b/app/scanner/pdf.rb new file mode 100644 index 0000000..8c4a98f --- /dev/null +++ b/app/scanner/pdf.rb @@ -0,0 +1,32 @@ +# frozen_string_literal: true + +require "scanner/scanner" + +module Scanner + # Scans for information about a .pdf file + class Pdf < Scanner + def handles?(filename) + filename&.downcase&.end_with?(".pdf") + end + + def scan_file(filename) + pdf_path = File.expand_path(filename).to_s + unless pdf_path.end_with?(".pdf") + puts "Unexpected internal error: path #{filename.inspect} does not end with \".pdf\"." + return + end + + book = scan_base_attributes(filename) + + jpeg_path = "#{pdf_path[0..-5]}.jpeg" + + return unless File.file?(jpeg_path) + + File.open(jpeg_path, "r") do |is| + book.cover = Cover.new(is, jpeg_path, "image/jpeg") + end + + book + end + end +end diff --git a/app/scanner/scanner.rb b/app/scanner/scanner.rb new file mode 100644 index 0000000..bbeab1f --- /dev/null +++ b/app/scanner/scanner.rb @@ -0,0 +1,120 @@ +# frozen_string_literal: true + +require "book" + +module Scanner + # Abstract base class for scanners that know how to gather information about an e-book file + class Scanner + def initialize(store) + @store = store + end + + def scan_file(_filename) + raise "Not implemented (abstract base class)" + end + + def handles?(_filename) + raise "Not implemented (abstract base class)" + end + + private + + NON_FIC_FOLDER = "00_nonFic" + private_constant :NON_FIC_FOLDER + + SERIES_AND_VOLUME_REGEX = /^([A-Z]+)([0-9]+(\.[0-9]+)?)$/ + private_constant :SERIES_AND_VOLUME_REGEX + + # Returns (series, volumeNo, titleText) + def process_title(input) + return if input.nil? + + arr = input.split("_") + + series = nil + vol = nil + + first = arr[0] + match_data = first.match(SERIES_AND_VOLUME_REGEX) + unless match_data.nil? 
+ capt = match_data.captures + series = capt[0] + vol = capt[1] + arr.shift + end + + pos = arr[-1].rindex(".") + arr[-1] = arr[-1].slice(0, pos) unless pos.nil? + + title = arr.join(" ") + + [series, vol, title] + end + + def check_title_grouping(path) + bare_title_grouping = + title_grouping(path) + .split("_") + .reject { |part| part.match(SERIES_AND_VOLUME_REGEX) } + .join("_") + + return if bare_title_grouping == Book.grouping_for_title(title) + + discrepancy = "#{bare_title_grouping.inspect} vs. #{Book.grouping_for_title(title).inspect}" + puts "WARNING: title_grouping mismatch: #{discrepancy}" + end + + def massage_author(input) + return if input.nil? + + reading_order = "" + input.each_char do |c| + reading_order += " " if upper?(c) && !reading_order.empty? + reading_order += c + end + + reading_order + end + + def scan_base_attributes(filename) + book = Book.new(@store) + + book.path = filename + + category = nil # e.g., non-fiction, fan-fiction + grouping = "" + + parts = filename.split("/") + (series_code, book.volume, book.title) = process_title(parts[-1]) + if parts.length > 1 + grouping = parts[-2] + reading_order = massage_author(grouping) + sort_order = nil + book.author = Author.new(grouping, reading_order, sort_order) + book.series_id = @store.get_series(grouping, series_code) + end + category = parts[-3] if parts.length > 2 + + book.arrived = File.ctime(filename) + book.classification_id = @store.find_classification(book.author.grouping, File.basename(filename, ".*")) + + return book unless category == NON_FIC_FOLDER && classification_id.nil? + + File.open(Store.unclassified_csv, "a") do |fd| + fd.puts "#{grouping.inspect},#{path.inspect}" + end + + book + end + + def title_grouping(path) + return if path.nil? 
+ + File.basename(path, ".*") + end + + def upper?(character) + /[[:upper:]]/.match(character) + end + end +end diff --git a/test/book_loader_test.rb b/test/book_loader_test.rb new file mode 100644 index 0000000..fd0adb5 --- /dev/null +++ b/test/book_loader_test.rb @@ -0,0 +1,74 @@ +# frozen_string_literal: true + +require "test_helper" + +require "book" +require "store_mock" + +class BookLoaderTest < Minitest::Test + def setup + @queue = Queue.new + @store = StoreMock.new + @book_loader = BookLoader.new("/path/to/config.ini", @queue, @store) + end + + def test_that_it_can_handle_epub_and_pdf_files + %w[epub pdf].each do |extension| + assert_equal true, @book_loader.handles?("sample.#{extension}") + end + end + + def test_that_it_cannot_handle_mobi_html_txt_doc_zip_rtf_nor_rar + %w[doc html mobi rar rtf txt zip].each do |extension| + assert_equal false, @book_loader.handles?("sample.#{extension}") + end + end + + def test_load_from_file + @store.expects(:get_series).returns(mock_series_lw) + @store.connect + + book = @book_loader.load_file(File.join(TestHelper::SAMPLE_DATA_PATH, "LouisaAlcott", "LW01_Little_Women.epub")) + + author = book.author + + assert_equal "LouisaAlcott", author.grouping + assert_equal "Louisa May Alcott", author.reading_order + assert_equal "Alcott, Louisa May", author.sort_order + + expected_descr = "This story follows the lives of the four March sisters&emdash;Meg, Jo, Beth, and Amy&emdash;" \ + "and details their coming of age." + + assert_equal expected_descr, book.description + assert_equal "en", book.language + assert_equal "Little Women: Or, Meg, Jo, Beth and Amy", book.title + assert_equal mock_series_lw.to_s, book.series_id.to_s + assert_equal 1, book.volume.to_i + end + + def test_heading + @store.expects(:get_series).returns(mock_series_lw) + @store.connect + + book = @book_loader.load_file(File.join(TestHelper::SAMPLE_DATA_PATH, "LouisaAlcott", "LW01_Little_Women.epub")) + + expected = "Little Women: Or, Meg, Jo, Beth and Amy
by Louisa May Alcott
01" + actual = book.heading + + assert_equal expected, actual + end + + private + + def mock_series_lw + id = 1 + series = Series.new(id) + series.age = "ya" + series.genre = "romance" + series.grouping = "LoisaAlcott" + series.code = "LW" + series.descr = "Little Women" + + series + end +end diff --git a/test/book_test.rb b/test/book_test.rb index 034ae04..a15cc84 100644 --- a/test/book_test.rb +++ b/test/book_test.rb @@ -31,44 +31,6 @@ class BookTest < Minitest::Test end end - def test_load_from_file - store = StoreMock.new - store.expects(:get_series).returns(mock_series_lw) - store.connect - book = Book.new(store) - - book.load_from_file!(File.join(TestHelper::SAMPLE_DATA_PATH, "LouisaAlcott", "LW01_Little_Women.epub")) - - author = book.author - - assert_equal "LouisaAlcott", author.grouping - assert_equal "Louisa May Alcott", author.reading_order - assert_equal "Alcott, Louisa May", author.sort_order - - expected_descr = "This story follows the lives of the four March sisters&emdash;Meg, Jo, Beth, and Amy&emdash;" \ - "and details their coming of age." - - assert_equal expected_descr, book.description - assert_equal "en", book.language - assert_equal "Little Women: Or, Meg, Jo, Beth and Amy", book.title - assert_equal mock_series_lw.to_s, book.series_id.to_s - assert_equal 1, book.volume.to_i - end - - def test_heading - store = StoreMock.new - store.expects(:get_series).returns(mock_series_lw) - store.connect - book = Book.new(store) - - book.load_from_file!(File.join(TestHelper::SAMPLE_DATA_PATH, "LouisaAlcott", "LW01_Little_Women.epub")) - - expected = "Little Women: Or, Meg, Jo, Beth and Amy
by Louisa May Alcott
01" - actual = book.heading - - assert_equal expected, actual - end - private def mock_series_lw