# Encapsulates info about a book in the library
class Book
- @@dc_ns_url = "http://purl.org/dc/elements/1.1/"
- @@series_and_volume_regex = /^([A-Z]+)([0-9]+(\.[0-9]+)?)$/
-
attr_accessor(
:arrived,
:author,
@store = store
end
- def load_from_file!(file_name)
- @path = file_name
- parse_file_name!(file_name)
- end
-
def self.can_handle?(file_name)
return false if file_name.nil?
reading_order
end
-
- # Returns (series, volumeNo, titleText)
- def process_title(input)
- return if input.nil?
-
- arr = input.split("_")
-
- series = nil
- vol = nil
-
- first = arr[0]
- match_data = first.match(@@series_and_volume_regex)
- unless match_data.nil?
- capt = match_data.captures
- series = capt[0]
- vol = capt[1]
- arr.shift
- end
-
- pos = arr[-1].rindex(".")
- arr[-1] = arr[-1].slice(0, pos) unless pos.nil?
-
- title = arr.join(" ")
-
- bare_title_grouping =
- title_grouping
- .split("_")
- .reject { |part| part.match(@@series_and_volume_regex) }
- .join("_")
-
- unless bare_title_grouping == Book.grouping_for_title(title)
- discrepancy = "#{bare_title_grouping.inspect} vs. #{Book.grouping_for_title(title).inspect}"
- puts "WARNING: title_grouping mismatch: #{discrepancy}"
- end
-
- [series, vol, title]
- end
-
- def parse_file_name!(file_name)
- category = nil # e.g., non-fiction, fan-fiction
- grouping = ""
-
- parts = file_name.split("/")
- (series_code, @volume, @title) = process_title(parts[-1])
- if parts.length > 1
- grouping = parts[-2]
- reading_order = massage_author(grouping)
- sort_order = nil
- @author = Author.new(grouping, reading_order, sort_order)
- @series_id = @store.get_series(grouping, series_code)
- end
- category = parts[-3] if parts.length > 2
-
- lc_file_name = file_name.downcase
- if lc_file_name.end_with?(".epub")
- scan_epub!(file_name)
- elsif lc_file_name.end_with?(".pdf")
- scan_pdf!(file_name)
- end
-
- @arrived = File.ctime(file_name)
-
- @classification_id = @store.find_classification(@author.grouping, File.basename(file_name, ".*"))
-
- # TODO: Fix horrible hard-coded strings and paths
- return unless category == "00_nonFic" && classification_id.nil?
-
- File.open(Store.unclassified_csv, "a") do |fd|
- fd.puts "#{grouping.inspect},#{path.inspect}"
- end
- end
-
- def scan_epub!(file_name)
- Zip.warn_invalid_date = false
- Zip::File.open(file_name) do |zipfile|
- entry = zipfile.find_entry("META-INF/container.xml")
- if entry.nil?
- puts "No META-INF/container.xml, skipping book #{file_name.inspect}"
- return nil
- end
- cont_xml = zipfile.read("META-INF/container.xml")
- cont_doc = Nokogiri::XML(cont_xml)
- opf_path = cont_doc.css("container rootfiles rootfile")[0]["full-path"]
-
- scan_opf!(zipfile, opf_path)
- rescue Zip::Error => e
- puts "ERROR processing file #{file_name.inspect}:"
- puts e.message
- puts e.backtrace
- end
- end
-
- def scan_pdf!(file_name)
- pdf_path = File.expand_path(file_name).to_s
- unless pdf_path.end_with?(".pdf")
- puts "Unexpected internal error: path #{file_name.inspect} does not end with \".pdf\"."
- return
- end
-
- jpeg_path = "#{pdf_path[0..-5]}.jpeg"
-
- return unless File.file?(jpeg_path)
-
- File.open(jpeg_path, "r") do |is|
- @cover = Cover.new(is, jpeg_path, "image/jpeg")
- end
- end
-
- def scan_opf!(zipfile, opf_path)
- cover_id = nil
-
- opf_xml = zipfile.read(opf_path)
- opf_doc = Nokogiri::XML(opf_xml)
-
- #-------
- # Author
-
- grouping = @author.grouping
- reading_order = @author.reading_order
- sort_order = @author.sort_order
-
- creators = opf_doc.css("dc|creator", "dc" => @@dc_ns_url)
- unless creators.empty?
- creator = creators[0]
-
- return if creator.nil?
-
- role = creator["opf:role"]
- if role == "aut"
- reading_order = creator.content
-
- file_as = creator["opf:file-as"]
- sort_order = file_as unless file_as.nil?
- end
-
- @author = Author.new(grouping, reading_order, sort_order)
- end
-
- #---------------------------------------
- # Title
-
- titles = opf_doc.css("dc|title", "dc" => @@dc_ns_url)
- unless titles.empty?
- title = titles[0]
- @title = title.content unless title.nil?
- end
-
- #---------------------------------------
- # Description
-
- descr_nodes = opf_doc.css("dc|description", "dc" => @@dc_ns_url)
- unless descr_nodes.empty?
- descr_node = descr_nodes[0]
- @description = descr_node.content unless descr_node.nil?
- end
-
- #---------------------------------------
- # Language
-
- lang_nodes = opf_doc.css("dc|language", "dc" => @@dc_ns_url)
- unless lang_nodes.empty?
- lang_node = lang_nodes[0]
- @language = lang_node.content if lang_node
- end
-
- #---------------------------------------
- # Other metadata: series, volume, cover
-
- metas = opf_doc.css("package metadata meta")
- metas.each do |m|
- name = m["name"]
- content = m["content"]
-
- case name
- when "calibre:series"
- # TODO: Dynamically create a new series?
- # @series_id = content
- when "calibre:series-index"
- @volume = content
- when "cover"
- cover_id = content
- end
- end
-
- #---------------
- # Load the cover
-
- @cover = load_cover(zipfile, opf_path, opf_doc, cover_id)
- end
-
- def load_cover(zipfile, opf_path, opf_doc, cover_id)
- cover_id = "cover-image" if cover_id.nil?
-
- items = opf_doc.css("package manifest item")
- items.each do |i|
- href = i["href"]
- id = i["id"]
- mime_type = i["media-type"]
-
- next unless cover_id == id
-
- entry = zipfile.find_entry(href)
-
- if entry.nil?
- # Although the epub standard requires the path to be relative
- # to the base of the epub (zip), some books encountered in the
- # wild have been found to use a bath relative to the location
- # of the opf file.
- parts = opf_path.split("/")
- opf_base_path = parts[0..-2].join("/")
- cover_path = "#{opf_base_path}/#{href}"
- entry = zipfile.find_entry(cover_path)
- end
-
- if !entry && href.start_with?("../")
- # Another case found in the wild: cover image is at the root, but path is '../cover.jpeg'
- cover_path = href[3..]
- entry = zipfile.find_entry(cover_path)
- end
-
- if entry.nil?
- puts "WARNING! Cover image #{href.inspect} not found in file #{@path.inspect}."
- return nil
- else
- entry.get_input_stream do |is|
- return Cover.new(is, href, mime_type)
- end
- end
- end
-
- nil
- end
end
require_relative "book"
require_relative "store"
+require "scanner/epub"
+require "scanner/pdf"
+require "scanner/scanner"
+
# Worker thread that pulls filenames from a queue and loads them as new books
class BookLoader
DONE_MARKER = "<END>"
- def initialize(config_file, queue)
# Builds a loader around the given queue and store.
#
# config_file - kept in @config_file; not read by the code visible here
# queue       - source of filenames consumed by #run via #pop
# store       - handed to every scanner and used by #run to persist books
def initialize(config_file, queue, store)
  @config_file = config_file
  @queue = queue
  @store = store

  # One scanner per supported format; order matters for #load_file,
  # which picks the first scanner that claims a file.
  @scanners = [Scanner::Epub, Scanner::Pdf].map { |scanner_class| scanner_class.new(@store) }
end
+
# Returns true when at least one registered scanner claims +filename+,
# false otherwise.
def handles?(filename)
  @scanners.any? { |candidate| candidate.handles?(filename) }
end
+
# Scans +filename+ with the first scanner that claims it.
#
# Returns whatever that scanner's #scan_file returns, or nil when no
# scanner handles the file.
def load_file(filename)
  scanner = @scanners.find { |candidate| candidate.handles?(filename) }
  scanner&.scan_file(filename)
end
def run
- @store = Store.new(@config_file)
@store.connect
file = @queue.pop
until file == DONE_MARKER
- book = Book.new(@store)
- book.load_from_file!(file)
- @store.store_book(book)
+ book = load_file(file)
+
+ @store.store_book(book) unless book.nil?
file = @queue.pop
end
--- /dev/null
+# frozen_string_literal: true
+
+require "nokogiri"
+require "zip"
+
+require "scanner/scanner"
+
module Scanner
  # Scans an .epub file for information about the book it represents.
  #
  # The base attributes come from the file path (see #scan_base_attributes in
  # the superclass); the OPF package document inside the archive then
  # overrides author, title, description, language, volume and cover when it
  # provides them.
  class Epub < Scanner
    DC_NS_URL = "http://purl.org/dc/elements/1.1/"
    private_constant :DC_NS_URL

    CONTAINER_PATH = "META-INF/container.xml"
    private_constant :CONTAINER_PATH

    # Manifest item id used for the cover when the OPF has no "cover" meta tag.
    DEFAULT_COVER_ID = "cover-image"
    private_constant :DEFAULT_COVER_ID

    # True when +filename+ ends in ".epub" (case-insensitive); nil for nil.
    def handles?(filename)
      filename&.downcase&.end_with?(".epub")
    end

    # Opens the epub and builds a book from its path and OPF metadata.
    #
    # Returns the book, or nil when the archive cannot be processed
    # (no container.xml, no rootfile entry, or a Zip::Error); the reason is
    # printed in each case.
    def scan_file(filename)
      Zip.warn_invalid_date = false
      Zip::File.open(filename) do |zipfile|
        opf_path = find_opf_path(zipfile, filename)
        return nil if opf_path.nil?

        book = scan_base_attributes(filename)
        scan_opf(book, zipfile, opf_path)
      end
    rescue Zip::Error => e
      # Rescued at method level so that a failure while opening the archive
      # is reported the same way as a failure while reading an entry.
      puts "ERROR processing file #{filename.inspect}:"
      puts e.message
      puts e.backtrace
      nil
    end

    private

    # Returns the "full-path" of the first rootfile listed in container.xml,
    # or nil (after printing why) when it cannot be determined.
    def find_opf_path(zipfile, filename)
      if zipfile.find_entry(CONTAINER_PATH).nil?
        puts "No META-INF/container.xml, skipping book #{filename.inspect}"
        return nil
      end

      container = Nokogiri::XML(zipfile.read(CONTAINER_PATH))
      rootfile = container.css("container rootfiles rootfile").first
      opf_path = rootfile && rootfile["full-path"]
      return opf_path unless opf_path.nil?

      puts "No rootfile in META-INF/container.xml, skipping book #{filename.inspect}"
      nil
    end

    # Reads the OPF document at +opf_path+ and copies its metadata onto
    # +book+. Returns +book+.
    def scan_opf(book, zipfile, opf_path)
      opf_doc = Nokogiri::XML(zipfile.read(opf_path))

      apply_creator(book, opf_doc)
      apply_dc_fields(book, opf_doc)
      cover_id = apply_meta_tags(book, opf_doc)
      book.cover = load_cover(zipfile, opf_path, opf_doc, cover_id)

      book
    end

    # Rebuilds book.author from the first dc:creator element, if any.
    # Only a creator with opf:role "aut" overrides the path-derived names.
    def apply_creator(book, opf_doc)
      creator = opf_doc.css("dc|creator", "dc" => DC_NS_URL).first
      return if creator.nil?

      # book.author is nil when the path had no parent directory.
      reading_order = book.author&.reading_order
      sort_order = book.author&.sort_order

      if creator["opf:role"] == "aut"
        reading_order = creator.content
        sort_order = creator["opf:file-as"] || sort_order
      end

      book.author = Author.new(book.author&.grouping, reading_order, sort_order)
    end

    # Copies dc:title, dc:description and dc:language onto +book+ when present.
    def apply_dc_fields(book, opf_doc)
      title = dc_content(opf_doc, "title")
      book.title = title unless title.nil?

      description = dc_content(opf_doc, "description")
      book.description = description unless description.nil?

      language = dc_content(opf_doc, "language")
      book.language = language unless language.nil?
    end

    # Text content of the first Dublin Core element named +element+, or nil.
    def dc_content(opf_doc, element)
      opf_doc.css("dc|#{element}", "dc" => DC_NS_URL).first&.content
    end

    # Applies <meta name=... content=...> tags: sets book.volume from
    # "calibre:series-index" and returns the content of the last "cover" tag
    # (nil when there is none).
    def apply_meta_tags(book, opf_doc)
      cover_id = nil

      opf_doc.css("package metadata meta").each do |meta|
        case meta["name"]
        when "calibre:series-index"
          book.volume = meta["content"]
        when "cover"
          cover_id = meta["content"]
        end
        # TODO: Dynamically create a new series from "calibre:series"?
      end

      cover_id
    end

    # Loads the cover image named by the manifest item whose id is +cover_id+
    # (DEFAULT_COVER_ID when nil).
    #
    # Returns a Cover, or nil when no such item exists or its file cannot be
    # found in the archive (a warning is printed in the latter case).
    def load_cover(zipfile, opf_path, opf_doc, cover_id)
      cover_id = DEFAULT_COVER_ID if cover_id.nil?

      item = opf_doc.css("package manifest item").find { |candidate| candidate["id"] == cover_id }
      return nil if item.nil?

      href = item["href"]
      entry = find_cover_entry(zipfile, opf_path, href)

      if entry.nil?
        # The archive's own name identifies the book; scanners hold no book state.
        puts "WARNING! Cover image #{href.inspect} not found in file #{zipfile.name.inspect}."
        return nil
      end

      entry.get_input_stream do |stream|
        return Cover.new(stream, href, item["media-type"])
      end
    end

    # Tries the locations where a cover referenced as +href+ has been seen.
    # Returns the zip entry or nil.
    def find_cover_entry(zipfile, opf_path, href)
      return nil if href.nil?

      entry = zipfile.find_entry(href)

      if entry.nil?
        # Although the epub standard requires the path to be relative
        # to the base of the epub (zip), some books encountered in the
        # wild have been found to use a path relative to the location
        # of the opf file.
        opf_base_path = opf_path.split("/")[0..-2].join("/")
        entry = zipfile.find_entry("#{opf_base_path}/#{href}")
      end

      if entry.nil? && href.start_with?("../")
        # Another case found in the wild: cover image is at the root, but path is '../cover.jpeg'
        entry = zipfile.find_entry(href[3..])
      end

      entry
    end
  end
end
--- /dev/null
+# frozen_string_literal: true
+
+require "scanner/scanner"
+
module Scanner
  # Scans for information about a .pdf file.
  #
  # All metadata comes from the file path (see #scan_base_attributes in the
  # superclass). The cover, if any, is a sidecar image with the same name as
  # the PDF but a ".jpeg" extension.
  class Pdf < Scanner
    # True when +filename+ ends in ".pdf" (case-insensitive); nil for nil.
    def handles?(filename)
      filename&.downcase&.end_with?(".pdf")
    end

    # Builds a book for the PDF at +filename+.
    #
    # Returns the book whether or not a sidecar cover exists, or nil when
    # +filename+ is not a PDF path.
    def scan_file(filename)
      pdf_path = File.expand_path(filename).to_s
      # Same case-insensitive rule as #handles?, so "X.PDF" is not rejected here.
      unless handles?(pdf_path)
        puts "Unexpected internal error: path #{filename.inspect} does not end with \".pdf\"."
        return
      end

      book = scan_base_attributes(filename)
      attach_cover(book, "#{pdf_path[0..-5]}.jpeg") # drop the 4-character ".pdf" suffix
      book
    end

    private

    # Sets book.cover from the image at +jpeg_path+; does nothing when that
    # file does not exist.
    def attach_cover(book, jpeg_path)
      return unless File.file?(jpeg_path)

      File.open(jpeg_path, "r") do |stream|
        book.cover = Cover.new(stream, jpeg_path, "image/jpeg")
      end
    end
  end
end
--- /dev/null
+# frozen_string_literal: true
+
+require "book"
+
module Scanner
  # Abstract base class for scanners that know how to gather information about an e-book file.
  #
  # Subclasses implement #handles? and #scan_file; the private helpers here
  # derive the attributes that can be read from the file path alone, which is
  # split on "/" as [.../<category>/<author grouping>/<file name>].
  class Scanner
    NON_FIC_FOLDER = "00_nonFic"
    private_constant :NON_FIC_FOLDER

    # A leading file-name part such as "LW01" or "AB2.5":
    # series code (capital letters) followed by a volume number.
    SERIES_AND_VOLUME_REGEX = /^([A-Z]+)([0-9]+(\.[0-9]+)?)$/
    private_constant :SERIES_AND_VOLUME_REGEX

    # store - object answering #get_series and #find_classification
    def initialize(store)
      @store = store
    end

    # Builds a book from the file at +_filename+. Must be overridden.
    def scan_file(_filename)
      raise "Not implemented (abstract base class)"
    end

    # Tells whether this scanner can process +_filename+. Must be overridden.
    def handles?(_filename)
      raise "Not implemented (abstract base class)"
    end

    private

    # Splits a file name such as "LW01_Little_Women.epub" into its parts.
    #
    # Returns [series, volumeNo, titleText] (series and volumeNo are nil when
    # the name has no leading series code), or nil when +input+ is nil.
    def process_title(input)
      return if input.nil?

      words = input.split("_")
      series = nil
      vol = nil

      # to_s: an empty input yields no words at all.
      match_data = SERIES_AND_VOLUME_REGEX.match(words.first.to_s)
      unless match_data.nil?
        series, vol = match_data.captures
        words.shift
      end

      # Drop the extension from the last word; a bare series code leaves no words.
      unless words.empty?
        dot = words.last.rindex(".")
        words[-1] = words.last.slice(0, dot) unless dot.nil?
      end

      [series, vol, words.join(" ")]
    end

    # Prints a warning when the file name at +path+, stripped of its series
    # code, differs from Book.grouping_for_title(title).
    #
    # title - title to compare against; derived from the file name when omitted
    def check_title_grouping(path, title = nil)
      return if path.nil?

      title ||= process_title(File.basename(path)).last

      bare_title_grouping =
        title_grouping(path)
        .split("_")
        .reject { |part| part.match?(SERIES_AND_VOLUME_REGEX) }
        .join("_")
      expected_grouping = Book.grouping_for_title(title)

      return if bare_title_grouping == expected_grouping

      discrepancy = "#{bare_title_grouping.inspect} vs. #{expected_grouping.inspect}"
      puts "WARNING: title_grouping mismatch: #{discrepancy}"
    end

    # Turns a grouping such as "LouisaAlcott" into "Louisa Alcott" by putting
    # a space before every upper-case character except the first character.
    # Returns nil for nil.
    def massage_author(input)
      return if input.nil?

      input.each_char.with_index.map do |char, index|
        index.positive? && upper?(char) ? " #{char}" : char
      end.join
    end

    # Creates a Book and fills in everything derivable from +filename+ itself:
    # path, volume, title, author, series, arrival time and classification.
    #
    # A book in the non-fiction folder without a classification is also
    # appended to the file named by Store.unclassified_csv.
    #
    # Returns the book.
    def scan_base_attributes(filename)
      book = Book.new(@store)
      book.path = filename

      parts = filename.split("/")
      series_code, book.volume, book.title = process_title(parts.last)

      grouping = ""
      if parts.length > 1
        grouping = parts[-2]
        book.author = Author.new(grouping, massage_author(grouping), nil)
        book.series_id = @store.get_series(grouping, series_code)
      end
      category = parts[-3] if parts.length > 2 # e.g., non-fiction, fan-fiction

      book.arrived = File.ctime(filename)
      # book.author is nil when the path has no parent directory.
      book.classification_id = @store.find_classification(book.author&.grouping, File.basename(filename, ".*"))

      record_unclassified(grouping, book) if category == NON_FIC_FOLDER && book.classification_id.nil?

      book
    end

    # Appends a line with the grouping and path of +book+ to the file named
    # by Store.unclassified_csv.
    def record_unclassified(grouping, book)
      File.open(Store.unclassified_csv, "a") do |fd|
        fd.puts "#{grouping.inspect},#{book.path.inspect}"
      end
    end

    # File name of +path+ without directory and extension; nil for nil.
    def title_grouping(path)
      return if path.nil?

      File.basename(path, ".*")
    end

    # True when +character+ contains an upper-case character.
    def upper?(character)
      character.match?(/[[:upper:]]/)
    end
  end
end
--- /dev/null
+# frozen_string_literal: true
+
+require "test_helper"
+
+require "book"
+require "store_mock"
+
# Exercises BookLoader: which file types it accepts and what it extracts
# from the sample "Little Women" epub.
class BookLoaderTest < Minitest::Test
  def setup
    @queue = Queue.new
    @store = StoreMock.new
    @book_loader = BookLoader.new("/path/to/config.ini", @queue, @store)
  end

  def test_that_it_can_handle_epub_and_pdf_files
    supported = %w[epub pdf]

    supported.each { |ext| assert_equal(true, @book_loader.handles?("sample.#{ext}")) }
  end

  def test_that_it_cannot_handle_mobi_html_txt_doc_zip_rtf_nor_rar
    unsupported = %w[doc html mobi rar rtf txt zip]

    unsupported.each { |ext| assert_equal(false, @book_loader.handles?("sample.#{ext}")) }
  end

  def test_load_from_file
    book = load_little_women
    author = book.author

    assert_equal "LouisaAlcott", author.grouping
    assert_equal "Louisa May Alcott", author.reading_order
    assert_equal "Alcott, Louisa May", author.sort_order

    expected_descr = "This story follows the lives of the four March sisters&emdash;Meg, Jo, Beth, and Amy&emdash;" \
                     "and details their coming of age."

    assert_equal expected_descr, book.description
    assert_equal "en", book.language
    assert_equal "Little Women: Or, Meg, Jo, Beth and Amy", book.title
    assert_equal mock_series_lw.to_s, book.series_id.to_s
    assert_equal 1, book.volume.to_i
  end

  def test_heading
    book = load_little_women

    assert_equal(
      "<b>Little Women: Or, Meg, Jo, Beth and Amy</b><br/><i>by Louisa May Alcott</i><br/>01",
      book.heading,
    )
  end

  private

  # Stubs the series lookup, connects the store mock and loads the sample epub.
  def load_little_women
    @store.expects(:get_series).returns(mock_series_lw)
    @store.connect

    sample = File.join(TestHelper::SAMPLE_DATA_PATH, "LouisaAlcott", "LW01_Little_Women.epub")
    @book_loader.load_file(sample)
  end

  # A fresh Series fixture for the "LW" (Little Women) series.
  # NOTE(review): grouping is spelled "LoisaAlcott" here while the sample
  # directory is "LouisaAlcott" -- presumably a typo; confirm before changing.
  def mock_series_lw
    Series.new(1).tap do |series|
      series.age = "ya"
      series.genre = "romance"
      series.grouping = "LoisaAlcott"
      series.code = "LW"
      series.descr = "Little Women"
    end
  end
end
end
end
- def test_load_from_file
- store = StoreMock.new
- store.expects(:get_series).returns(mock_series_lw)
- store.connect
- book = Book.new(store)
-
- book.load_from_file!(File.join(TestHelper::SAMPLE_DATA_PATH, "LouisaAlcott", "LW01_Little_Women.epub"))
-
- author = book.author
-
- assert_equal "LouisaAlcott", author.grouping
- assert_equal "Louisa May Alcott", author.reading_order
- assert_equal "Alcott, Louisa May", author.sort_order
-
- expected_descr = "This story follows the lives of the four March sisters&emdash;Meg, Jo, Beth, and Amy&emdash;" \
- "and details their coming of age."
-
- assert_equal expected_descr, book.description
- assert_equal "en", book.language
- assert_equal "Little Women: Or, Meg, Jo, Beth and Amy", book.title
- assert_equal mock_series_lw.to_s, book.series_id.to_s
- assert_equal 1, book.volume.to_i
- end
-
- def test_heading
- store = StoreMock.new
- store.expects(:get_series).returns(mock_series_lw)
- store.connect
- book = Book.new(store)
-
- book.load_from_file!(File.join(TestHelper::SAMPLE_DATA_PATH, "LouisaAlcott", "LW01_Little_Women.epub"))
-
- expected = "<b>Little Women: Or, Meg, Jo, Beth and Amy</b><br/><i>by Louisa May Alcott</i><br/>01"
- actual = book.heading
-
- assert_equal expected, actual
- end
-
private
def mock_series_lw