X-Git-Url: http://jaekl.net/gitweb/?p=quanlib.git;a=blobdiff_plain;f=book.rb;h=cd14ab690b6164c14c7f842b4755e85bb87d231e;hp=f59b61a66ea44d17a86e7d999e34496b84fbea99;hb=HEAD;hpb=e9d890ae4d346ce3abe93a8db94d3a3ddf9819d9
diff --git a/book.rb b/book.rb
index f59b61a..2b93f4b 100644
--- a/book.rb
+++ b/book.rb
@@ -1,23 +1,38 @@
require 'nokogiri'
+require 'rubygems'
require 'zip'
-require 'author'
-require 'cover'
+require_relative 'author'
+require_relative 'classification'
+require_relative 'cover'
+require_relative 'store'
class Book
- def initialize(fileName)
- @author = nil
- @cover = nil
- @path = fileName
- @series = nil
- @title = nil
- @volume = nil
+ @@DC_NS_URL = 'http://purl.org/dc/elements/1.1/'
+ @@SERIES_AND_VOLUME_REGEX = /^([A-Z]+)([0-9]+(\.[0-9]+)?)$/
+
+ attr_accessor :arrived
+ attr_accessor :author
+ attr_accessor :classification_id
+ attr_accessor :cover
+ attr_accessor :description
+ attr_accessor :language
+ attr_accessor :path
+ attr_accessor :series_id
+ attr_accessor :title
+ attr_accessor :volume
+
+ def initialize(store)
+ @store = store
+ end
- parseFileName!(fileName)
+ def load_from_file!(fileName)
+ @path = fileName
+ parse_file_name!(fileName)
end
- def self.canHandle?(fileName)
+ def self.can_handle?(fileName)
if nil == fileName
return false
end
@@ -29,14 +44,26 @@ class Book
return true
end
+ if lowerName.end_with?(".pdf")
+ return true
+ end
+
return false
end
- def cover
- return @cover
+ def self.grouping_for_title(title)
+ result = title
+
+ '\'",!#'.split('').each do |c|
+ result = result.gsub(c, '-')
+ end
+ result = result.gsub(/: */, '--')
+ result = result.gsub(' ', '_')
+
+ result
end
- def describe
+ def heading
result = []
if nil != @title
@@ -45,12 +72,13 @@ class Book
result.push('(Unknown title)')
end
if nil != @author
- result.push(@author.to_s())
+ result.push('by ' + @author.reading_order + '')
end
-
+
seriesInfo = []
- if nil != @series
- seriesInfo.push(@series.to_s)
+ series = @store.load_series(@series_id)
+ if nil != series and nil != series.descr
+ seriesInfo.push(series.descr.to_s)
end
if nil != @volume
seriesInfo.push(@volume.to_s)
@@ -59,16 +87,29 @@ class Book
result.push(seriesInfo.join(' '))
end
+ classification = nil
+ if nil != @classification_id
+ classification = @store.load_classification(@classification_id)
+ end
+ if nil != classification
+ if nil != classification.ddc
+ result.push('Dewey: ' + classification.ddc.to_s)
+ end
+ if nil != classification.lcc
+ result.push('LCC: ' + classification.lcc.to_s)
+ end
+ end
+
return result.join('
')
end
def inspect
data = []
if nil != @author
- data.push('author="' + @author.to_s + '"')
+ data.push('author="' + @author.inspect + '"')
end
- if nil != @series
- data.push('series="' + @series + '"')
+ if nil != @series_id
+ data.push('series_id="' + @series_id.to_s() + '"')
end
if nil != @volume
data.push('volume="' + @volume + '"')
@@ -85,34 +126,38 @@ class Book
return '(Book:' + data.join(',') + ')'
end
- def path
- @path
- end
-
def to_s
return inspect()
end
+ def title_grouping
+ if nil == @path
+ return nil
+ end
+
+ return File.basename(@path, '.*')
+ end
+
protected
def isUpper?(c)
return /[[:upper:]]/.match(c)
end
protected
- def massageAuthor(input)
+ def massage_author(input)
if nil == input
return nil
end
- result = ""
+ reading_order = ""
input.each_char do |c|
- if isUpper?(c) and (result.length > 0)
- result += " "
- end
- result += c
+ if isUpper?(c) and (reading_order.length > 0)
+ reading_order += " "
+ end
+ reading_order += c
end
-
- return result
+
+ return reading_order
end
# Returns (series, volumeNo, titleText)
@@ -128,7 +173,7 @@ class Book
vol = nil
first = arr[0]
- matchData = (arr[0]).match(/^([A-Z]+)([0-9]+)$/)
+ matchData = (arr[0]).match(@@SERIES_AND_VOLUME_REGEX)
if nil != matchData
capt = matchData.captures
series = capt[0]
@@ -143,29 +188,64 @@ class Book
title = arr.join(' ')
+ bare_title_grouping = title_grouping
+ .split('_')
+ .reject { |part| part.match(@@SERIES_AND_VOLUME_REGEX) }
+ .join('_')
+
+ unless bare_title_grouping == Book.grouping_for_title(title)
+ puts "WARNING: title_grouping mismatch: #{bare_title_grouping.inspect} vs. #{Book.grouping_for_title(title).inspect}"
+ end
+
return series, vol, title
end
protected
- def parseFileName!(fileName)
- parts = fileName.split('/')
- (@series, @volume, @title) = processTitle(parts[-1])
+ def parse_file_name!(file_name)
+ category = nil # e.g., non-fiction, fan-fiction
+ grouping = ''
+
+ parts = file_name.split('/')
+ (series_code, @volume, @title) = processTitle(parts[-1])
if parts.length > 1
- @author = massageAuthor(parts[-2])
+ grouping = parts[-2]
+ reading_order = massage_author(grouping)
+ sort_order = nil
+ @author = Author.new(grouping, reading_order, sort_order)
+ @series_id = @store.get_series(grouping, series_code)
+ end
+ if parts.length > 2
+ category = parts[-3]
end
- if fileName.downcase.end_with?(".epub")
- scanEpub!(fileName)
+ lc_file_name = file_name.downcase
+ if lc_file_name.end_with?(".epub")
+ scanEpub!(file_name)
+ elsif lc_file_name.end_with?(".pdf")
+ scan_pdf!(file_name)
+ end
+
+ @arrived = File.ctime(file_name)
+
+ @classification_id = @store.find_classification(@author.grouping, File.basename(file_name, '.*'))
+
+ # TODO: Fix horrible hard-coded strings and paths
+ if ('01_nonfic' == category) && (nil == classification_id)
+ open(Store.unclassified_csv, 'a') do |fd|
+ fd.puts('"' + grouping.to_s + '","' + path + '"')
+ end
end
end
- protected
+ protected
def scanEpub!(fileName)
#puts 'Scanning "' + fileName.to_s + '"...'
begin
+ Zip.warn_invalid_date = false
Zip::File.open(fileName) do |zipfile|
entry = zipfile.find_entry('META-INF/container.xml')
if nil == entry
+ puts 'No META-INF/container.xml, skipping book ' + fileName
return
end
contXml = zipfile.read('META-INF/container.xml')
@@ -181,6 +261,25 @@ class Book
end
end
+ protected
+ def scan_pdf!(file_name)
+ #puts 'Scanning "' + file_name.to_s + '"...'
+
+ pdf_path = File.expand_path(file_name).to_s
+ if ! pdf_path.end_with?('.pdf')
+ puts 'Unexpected internal error: path "' + file_name.to_s + '" does not end with ".pdf".'
+ return
+ end
+
+ jpeg_path = pdf_path[0..-5] + '.jpeg'
+ if File.file?(jpeg_path)
+ File.open(jpeg_path, 'r') do |is|
+ @cover = Cover.new(is, jpeg_path, 'image/jpeg')
+ end
+ end
+ end
+
+
protected
def scanOpf!(zipfile, opfPath)
coverId = nil
@@ -191,22 +290,58 @@ class Book
#-------
# Author
- creator = opfDoc.css('dc|creator', 'dc' => 'http://purl.org/dc/elements/1.1/')
- if (nil != creator) and (creator.length > 0)
- roleNode = creator.attr('role')
- if nil != roleNode
- role = roleNode.value
- if ('aut' == role) and (creator.children.length > 0) and (nil != creator.children[0])
- name = creator.children[0].content
- parts = name.split(' ')
- if parts.length > 1
- surname = parts[-1]
- givenNames = parts[0..-2].join(' ')
- @author = Author.new(surname, givenNames)
- else
- @author = Author.new(name, '')
+ grouping = @author.grouping
+ reading_order = @author.reading_order
+ sort_order = @author.sort_order
+
+ creators = opfDoc.css('dc|creator', 'dc' => @@DC_NS_URL)
+ if (creators.length > 0)
+ creator = creators[0]
+ if nil != creator
+ role = creator['opf:role']
+ if 'aut' == role
+ reading_order = creator.content
+
+ file_as = creator['opf:file-as']
+ if nil != file_as
+ sort_order = file_as
end
end
+
+ @author = Author.new(grouping, reading_order, sort_order)
+ end
+ end
+
+ #---------------------------------------
+ # Title
+
+ titles = opfDoc.css('dc|title', 'dc' => @@DC_NS_URL)
+ if titles.length > 0
+ title = titles[0]
+ if nil != title
+ @title = title.content
+ end
+ end
+
+ #---------------------------------------
+ # Description
+
+ descrNodes = opfDoc.css('dc|description', 'dc' => @@DC_NS_URL)
+ if (descrNodes.length > 0)
+ descrNode = descrNodes[0]
+ if nil != descrNode
+ @description = descrNode.content
+ end
+ end
+
+ #---------------------------------------
+ # Language
+
+ langNodes = opfDoc.css('dc|language', 'dc' => @@DC_NS_URL)
+ if (langNodes.length > 0)
+ langNode = langNodes[0]
+ if langNode
+ @language = langNode.content
end
end
@@ -219,22 +354,24 @@ class Book
content = m['content']
if 'calibre:series' == name
- @series = content
+ # TODO: Dynamically create a new series?
+ # @series_id = content
elsif 'calibre:series-index' == name
@volume = content
elsif 'cover' == name
coverId = content
+ #puts 'File ' + @path + ' coverId ' + coverId
end
end
#---------------
# Load the cover
- @cover = loadCover(zipfile, opfPath, opfDoc, coverId)
+ @cover = load_cover(zipfile, opfPath, opfDoc, coverId)
end
protected
- def loadCover(zipfile, opfPath, opfDoc, coverId)
+ def load_cover(zipfile, opfPath, opfDoc, coverId)
coverFile = nil
if nil == coverId
coverId = "cover-image"
@@ -250,9 +387,9 @@ class Book
entry = zipfile.find_entry(href)
if nil == entry
- # Although the epub standard requires the path to be relative
+ # Although the epub standard requires the path to be relative
# to the base of the epub (zip), some books encountered in the
- # wild have been found to use a bath relative to the location
+ # wild have been found to use a path relative to the location
# of the opf file.
parts = opfPath.split('/')
opfBasePath = opfPath.split('/')[0..-2].join('/')
@@ -260,6 +397,14 @@ class Book
entry = zipfile.find_entry(coverPath)
end
+ unless entry
+ # Another case found in the wild: cover image is at the root, but path is '../cover.jpeg'
+ if href.start_with? '../'
+ coverPath = href[3..-1]
+ entry = zipfile.find_entry(coverPath)
+ end
+ end
+
if nil == entry
puts 'WARNING! Cover image "' + href + '" not found in file "' + @path + '".'
return nil