X-Git-Url: http://jaekl.net/gitweb/?p=quanlib.git;a=blobdiff_plain;f=book.rb;h=6d90c0e2947a0a243b3480758518159e075d8d85;hp=370093da92964413102b286792ab27a4e8ed6415;hb=HEAD;hpb=061091d1fc2bb2351afc695a5fcbdbc19e48e03b diff --git a/book.rb b/book.rb index 370093d..2b93f4b 100644 --- a/book.rb +++ b/book.rb @@ -1,41 +1,69 @@ require 'nokogiri' +require 'rubygems' require 'zip' -require 'author' -require 'cover' +require_relative 'author' +require_relative 'classification' +require_relative 'cover' +require_relative 'store' class Book - def initialize(fileName) - @author = nil - @cover = nil - @path = fileName - @series = nil - @title = nil - @volume = nil + @@DC_NS_URL = 'http://purl.org/dc/elements/1.1/' + @@SERIES_AND_VOLUME_REGEX = /^([A-Z]+)([0-9]+(\.[0-9]+)?)$/ + + attr_accessor :arrived + attr_accessor :author + attr_accessor :classification_id + attr_accessor :cover + attr_accessor :description + attr_accessor :language + attr_accessor :path + attr_accessor :series_id + attr_accessor :title + attr_accessor :volume + + def initialize(store) + @store = store + end - parseFileName!(fileName) + def load_from_file!(fileName) + @path = fileName + parse_file_name!(fileName) end - def self.canHandle?(fileName) + def self.can_handle?(fileName) if nil == fileName return false end + #puts "Filename: " + fileName.to_s lowerName = fileName.downcase() if lowerName.end_with?(".epub") return true end + if lowerName.end_with?(".pdf") + return true + end + return false end - def cover - return @cover + def self.grouping_for_title(title) + result = title + + '\'",!#'.split('').each do |c| + result = result.gsub(c, '-') + end + result = result.gsub(/: */, '--') + result = result.gsub(' ', '_') + + result end - def describe + def heading result = [] if nil != @title @@ -44,12 +72,13 @@ class Book result.push('(Unknown title)') end if nil != @author - result.push(@author.to_s()) + result.push('by ' + @author.reading_order + '') end - + seriesInfo = [] - if nil != @series - seriesInfo.push(@series.to_s) + series = @store.load_series(@series_id) + if nil != series and nil != series.descr + seriesInfo.push(series.descr.to_s) end if nil != @volume seriesInfo.push(@volume.to_s) @@ -58,16 +87,29 @@ class Book result.push(seriesInfo.join(' ')) end + classification = nil + if nil != @classification_id + classification = @store.load_classification(@classification_id) + end + if nil != classification + if nil != classification.ddc + result.push('Dewey: ' + classification.ddc.to_s) + end + if nil != classification.lcc + result.push('LCC: ' + classification.lcc.to_s) + end + end + return result.join('
') end def inspect data = [] if nil != @author - data.push('author="' + @author.to_s + '"') + data.push('author="' + @author.inspect + '"') end - if nil != @series - data.push('series="' + @series + '"') + if nil != @series_id + data.push('series_id="' + @series_id.to_s() + '"') end if nil != @volume data.push('volume="' + @volume + '"') @@ -88,26 +130,34 @@ class Book return inspect() end + def title_grouping + if nil == @path + return nil + end + + return File.basename(@path, '.*') + end + protected def isUpper?(c) return /[[:upper:]]/.match(c) end protected - def massageAuthor(input) + def massage_author(input) if nil == input return nil end - result = "" + reading_order = "" input.each_char do |c| - if isUpper?(c) and (result.length > 0) - result += " " - end - result += c + if isUpper?(c) and (reading_order.length > 0) + reading_order += " " + end + reading_order += c end - - return result + + return reading_order end # Returns (series, volumeNo, titleText) @@ -123,7 +173,7 @@ class Book vol = nil first = arr[0] - matchData = (arr[0]).match(/^([A-Z]+)([0-9]+)$/) + matchData = (arr[0]).match(@@SERIES_AND_VOLUME_REGEX) if nil != matchData capt = matchData.captures series = capt[0] @@ -138,34 +188,98 @@ class Book title = arr.join(' ') + bare_title_grouping = title_grouping + .split('_') + .reject { |part| part.match(@@SERIES_AND_VOLUME_REGEX) } + .join('_') + + unless bare_title_grouping == Book.grouping_for_title(title) + puts "WARNING: title_grouping mismatch: #{bare_title_grouping.inspect} vs. #{Book.grouping_for_title(title).inspect}" + end + return series, vol, title end protected - def parseFileName!(fileName) - parts = fileName.split('/') - (@series, @volume, @title) = processTitle(parts[-1]) + def parse_file_name!(file_name) + category = nil # e.g., non-fiction, fan-fiction + grouping = '' + + parts = file_name.split('/') + (series_code, @volume, @title) = processTitle(parts[-1]) if parts.length > 1 - @author = massageAuthor(parts[-2]) + grouping = parts[-2] + reading_order = massage_author(grouping) + sort_order = nil + @author = Author.new(grouping, reading_order, sort_order) + @series_id = @store.get_series(grouping, series_code) + end + if parts.length > 2 + category = parts[-3] end - if fileName.downcase.end_with?(".epub") - scanEpub!(fileName) + lc_file_name = file_name.downcase + if lc_file_name.end_with?(".epub") + scanEpub!(file_name) + elsif lc_file_name.end_with?(".pdf") + scan_pdf!(file_name) + end + + @arrived = File.ctime(file_name) + + @classification_id = @store.find_classification(@author.grouping, File.basename(file_name, '.*')) + + # TODO: Fix horrible hard-coded strings and paths + if ('01_nonfic' == category) && (nil == classification_id) + open(Store.unclassified_csv, 'a') do |fd| + fd.puts('"' + grouping.to_s + '","' + path + '"') + end end end - protected + protected def scanEpub!(fileName) - puts 'Scanning "' + fileName.to_s + '"...' - Zip::File.open(fileName) do |zipfile| - contXml = zipfile.read('META-INF/container.xml') - contDoc = Nokogiri::XML(contXml) - opfPath = contDoc.css("container rootfiles rootfile")[0]['full-path'] + #puts 'Scanning "' + fileName.to_s + '"...' + begin + Zip.warn_invalid_date = false + Zip::File.open(fileName) do |zipfile| + entry = zipfile.find_entry('META-INF/container.xml') + if nil == entry + puts 'No META-INF/container.xml, skipping book ' + fileName + return + end + contXml = zipfile.read('META-INF/container.xml') + contDoc = Nokogiri::XML(contXml) + opfPath = contDoc.css("container rootfiles rootfile")[0]['full-path'] + + scanOpf!(zipfile, opfPath) + end + rescue Zip::Error => exc + puts 'ERROR processing file "' + fileName + '":' + puts exc.message + puts exc.backtrace + end + end - scanOpf!(zipfile, opfPath) + protected + def scan_pdf!(file_name) + #puts 'Scanning "' + file_name.to_s + '"...' + + pdf_path = File.expand_path(file_name).to_s + if ! pdf_path.end_with?('.pdf') + puts 'Unexpected internal error: path "' + file_name.to_s + '" does not end with ".pdf".' + return + end + + jpeg_path = pdf_path[0..-5] + '.jpeg' + if File.file?(jpeg_path) + File.open(jpeg_path, 'r') do |is| + @cover = Cover.new(is, jpeg_path, 'image/jpeg') + end end end + protected def scanOpf!(zipfile, opfPath) coverId = nil @@ -176,22 +290,58 @@ class Book #------- # Author - creator = opfDoc.css('dc|creator', 'dc' => 'http://purl.org/dc/elements/1.1/') - if nil != creator - roleNode = creator.attr('role') - if nil != roleNode - role = roleNode.value + grouping = @author.grouping + reading_order = @author.reading_order + sort_order = @author.sort_order + + creators = opfDoc.css('dc|creator', 'dc' => @@DC_NS_URL) + if (creators.length > 0) + creator = creators[0] + if nil != creator + role = creator['opf:role'] if 'aut' == role - name = creator.children[0].content - parts = name.split(' ') - if parts.length > 1 - surname = parts[-1] - givenNames = parts[0..-2].join(' ') - @author = Author.new(surname, givenNames) - else - @author = Author.new(name, '') + reading_order = creator.content + + file_as = creator['opf:file-as'] + if nil != file_as + sort_order = file_as end end + + @author = Author.new(grouping, reading_order, sort_order) + end + end + + #--------------------------------------- + # Title + + titles = opfDoc.css('dc|title', 'dc' => @@DC_NS_URL) + if titles.length > 0 + title = titles[0] + if nil != title + @title = title.content + end + end + + #--------------------------------------- + # Description + + descrNodes = opfDoc.css('dc|description', 'dc' => @@DC_NS_URL) + if (descrNodes.length > 0) + descrNode = descrNodes[0] + if nil != descrNode + @description = descrNode.content + end + end + + #--------------------------------------- + # Language + + langNodes = opfDoc.css('dc|language', 'dc' => @@DC_NS_URL) + if (langNodes.length > 0) + langNode = langNodes[0] + if langNode + @language = langNode.content end end @@ -204,33 +354,68 @@ class Book content = m['content'] if 'calibre:series' == name - @series = content + # TODO: Dynamically create a new series? + # @series_id = content elsif 'calibre:series-index' == name @volume = content elsif 'cover' == name coverId = content + #puts 'File ' + @path + ' coverId ' + coverId end end #--------------- # Load the cover + @cover = load_cover(zipfile, opfPath, opfDoc, coverId) + end + + protected + def load_cover(zipfile, opfPath, opfDoc, coverId) coverFile = nil - if nil != coverId - items = opfDoc.css('package manifest item') - for i in items - href = i['href'] - id = i['id'] - mimeType = i['media-type'] - - if coverId == id - entry = zipfile.find_entry(href) + if nil == coverId + coverId = "cover-image" + end + + items = opfDoc.css('package manifest item') + for i in items + href = i['href'] + id = i['id'] + mimeType = i['media-type'] + + if coverId == id + entry = zipfile.find_entry(href) + + if nil == entry + # Although the epub standard requires the path to be relative + # to the base of the epub (zip), some books encountered in the + # wild have been found to use a bath relative to the location + # of the opf file. + parts = opfPath.split('/') + opfBasePath = opfPath.split('/')[0..-2].join('/') + coverPath = opfBasePath + '/' + href + entry = zipfile.find_entry(coverPath) + end + + unless entry + # Another case found in the wild: cover image is at the root, but path is '../cover.jpeg' + if href.start_with? '../' + coverPath = href[3..-1] + entry = zipfile.find_entry(coverPath) + end + end + + if nil == entry + puts 'WARNING! Cover image "' + href + '" not found in file "' + @path + '".' + return nil + else entry.get_input_stream() do |is| - @cover = Cover.new(is, href, mimeType) + return Cover.new(is, href, mimeType) end end end end + return nil end end