book.rb

   1
   2 require 'nokogiri'
   3 require 'rubygems'
   4 require 'zip'
   5
   6 require_relative 'author'
   7 require_relative 'classification'
   8 require_relative 'cover'
   9 require_relative 'store'
  10
  11 class Book
  12   @@DC_NS_URL = 'http://purl.org/dc/elements/1.1/'
  13   @@SERIES_AND_VOLUME_REGEX = /^([A-Z]+)([0-9]+(\.[0-9]+)?)$/
  14
  15   attr_accessor :author
  16   attr_accessor :classification_id
  17   attr_accessor :cover
  18   attr_accessor :description
  19   attr_accessor :language
  20   attr_accessor :path
  21   attr_accessor :series_id
  22   attr_accessor :title
  23   attr_accessor :volume
  24
  25   def initialize(store)
  26     @store = store
  27   end
  28
  29   def load_from_file!(fileName)
  30     @path = fileName
  31     parse_file_name!(fileName)
  32   end
  33
  34   def self.can_handle?(fileName)
  35     if nil == fileName
  36       return false
  37     end
  38
  39     #puts "Filename:  " + fileName.to_s
  40     lowerName = fileName.downcase()
  41
  42     if lowerName.end_with?(".epub")
  43       return true
  44     end
  45
  46     if lowerName.end_with?(".pdf")
  47       return true
  48     end
  49
  50     return false
  51   end
  52
  53   def self.grouping_for_title(title)
  54     result = title
  55
  56     '\'",!#'.split('').each do |c|
  57       result = result.gsub(c, '-')
  58     end
  59     result = result.gsub(/: */, '--')
  60     result = result.gsub(' ', '_')
  61
  62     result
  63   end
  64
  65   def heading
  66     result = []
  67
  68     if nil != @title
  69       result.push('<b>' + @title + '</b>')
  70     else
  71       result.push('<i>(Unknown title)</i>')
  72     end
  73     if nil != @author
  74       result.push('<i>by ' + @author.reading_order + '</i>')
  75     end
  76
  77     seriesInfo = []
  78     series = @store.load_series(@series_id)
  79     if nil != series and nil != series.descr
  80       seriesInfo.push(series.descr.to_s)
  81     end
  82     if nil != @volume
  83       seriesInfo.push(@volume.to_s)
  84     end
  85     if seriesInfo.length > 0
  86       result.push(seriesInfo.join(' '))
  87     end
  88
  89     classification = nil
  90     if nil != @classification_id
  91       classification = @store.load_classification(@classification_id)
  92     end
  93     if nil != classification
  94       if nil != classification.ddc
  95         result.push('Dewey: ' + classification.ddc.to_s)
  96       end
  97       if nil != classification.lcc
  98         result.push('LCC: ' + classification.lcc.to_s)
  99       end
 100     end
 101
 102     return result.join('<br/>')
 103   end
 104
 105   def inspect
 106     data = []
 107     if nil != @author
 108       data.push('author="' + @author.inspect + '"')
 109     end
 110     if nil != @series_id
 111       data.push('series_id="' + @series_id.to_s() + '"')
 112     end
 113     if nil != @volume
 114       data.push('volume="' + @volume + '"')
 115     end
 116     if nil != @title
 117       data.push('title="' + @title + '"')
 118     end
 119     if nil != @cover
 120       data.push(@cover.inspect())
 121     end
 122     if nil != @path
 123       data.push('path="' + @path + '"')
 124     end
 125     return '(Book:' + data.join(',') + ')'
 126   end
 127
 128   def to_s
 129     return inspect()
 130   end
 131
 132   def title_grouping
 133     if nil == @path
 134       return nil
 135     end
 136
 137     return File.basename(@path, '.*')
 138   end
 139
 140   protected
 141   def isUpper?(c)
 142     return /[[:upper:]]/.match(c)
 143   end
 144
 145   protected
 146   def massage_author(input)
 147     if nil == input
 148       return nil
 149     end
 150
 151     reading_order = ""
 152     input.each_char do |c|
 153       if isUpper?(c) and (reading_order.length > 0)
 154         reading_order += " "
 155      end
 156       reading_order += c
 157     end
 158
 159     return reading_order
 160   end
 161
 162   # Returns (series, volumeNo, titleText)
 163   protected
 164   def processTitle(input)
 165     if nil == input
 166       return nil
 167     end
 168
 169     arr = input.split('_')
 170
 171     series = nil
 172     vol = nil
 173
 174     first = arr[0]
 175     matchData = (arr[0]).match(@@SERIES_AND_VOLUME_REGEX)
 176     if nil != matchData
 177       capt = matchData.captures
 178       series = capt[0]
 179       vol = capt[1]
 180       arr.shift
 181     end
 182
 183     pos = arr[-1].rindex('.')
 184     if nil != pos
 185       arr[-1] = arr[-1].slice(0, pos)
 186     end
 187
 188     title = arr.join(' ')
 189
 190     bare_title_grouping = title_grouping
 191       .split('_')
 192       .reject { |part| part.match(@@SERIES_AND_VOLUME_REGEX) }
 193       .join('_')
 194
 195     unless bare_title_grouping == Book.grouping_for_title(title)
 196       puts "WARNING:  title_grouping mismatch:  #{bare_title_grouping.inspect} vs. #{Book.grouping_for_title(title).inspect}"
 197     end
 198
 199     return series, vol, title
 200   end
 201
 202   protected
 203   def parse_file_name!(file_name)
 204     category = nil   # e.g., non-fiction, fan-fiction
 205     grouping = ''
 206
 207     parts = file_name.split('/')
 208     (series_code, @volume, @title) = processTitle(parts[-1])
 209     if parts.length > 1
 210       grouping = parts[-2]
 211       reading_order = massage_author(grouping)
 212       sort_order = nil
 213       @author = Author.new(grouping, reading_order, sort_order)
 214       @series_id = @store.get_series(grouping, series_code)
 215     end
 216     if parts.length > 2
 217       category = parts[-3]
 218     end
 219
 220     lc_file_name = file_name.downcase
 221     if lc_file_name.end_with?(".epub")
 222       scanEpub!(file_name)
 223     elsif lc_file_name.end_with?(".pdf")
 224       scan_pdf!(file_name)
 225     end
 226
 227     @classification_id = @store.find_classification(@author.grouping, File.basename(file_name, '.*'))
 228
 229     # TODO:  Fix horrible hard-coded strings and paths
 230     if ('01_nonfic' == category) && (nil == classification_id)
 231       open(Store.unclassified_csv, 'a') do |fd|
 232         fd.puts('"' + grouping.to_s + '","' + path + '"')
 233       end
 234     end
 235   end
 236
 237   protected
 238   def scanEpub!(fileName)
 239     #puts 'Scanning "' + fileName.to_s + '"...'
 240     begin
 241       Zip.warn_invalid_date = false
 242       Zip::File.open(fileName) do |zipfile|
 243         entry = zipfile.find_entry('META-INF/container.xml')
 244         if nil == entry
 245           puts 'No META-INF/container.xml, skipping book ' + fileName
 246           return
 247         end
 248         contXml = zipfile.read('META-INF/container.xml')
 249         contDoc = Nokogiri::XML(contXml)
 250         opfPath = contDoc.css("container rootfiles rootfile")[0]['full-path']
 251
 252         scanOpf!(zipfile, opfPath)
 253       end
 254     rescue Zip::Error => exc
 255       puts 'ERROR processing file "' + fileName + '":'
 256       puts exc.message
 257       puts exc.backtrace
 258     end
 259   end
 260
 261   protected
 262   def scan_pdf!(file_name)
 263     #puts 'Scanning "' + file_name.to_s + '"...'
 264
 265     pdf_path = File.expand_path(file_name).to_s
 266     if ! pdf_path.end_with?('.pdf')
 267       puts 'Unexpected internal error:  path "' + file_name.to_s + '" does not end with ".pdf".'
 268       return
 269     end
 270
 271     jpeg_path = pdf_path[0..-5] + '.jpeg'
 272     if File.file?(jpeg_path)
 273       File.open(jpeg_path, 'r') do |is|
 274         @cover = Cover.new(is, jpeg_path, 'image/jpeg')
 275       end
 276     end
 277   end
 278
 279
 280   protected
 281   def scanOpf!(zipfile, opfPath)
 282     coverId = nil
 283
 284     opfXml = zipfile.read(opfPath)
 285     opfDoc = Nokogiri::XML(opfXml)
 286
 287     #-------
 288     # Author
 289
 290     grouping = @author.grouping
 291     reading_order = @author.reading_order
 292     sort_order = @author.sort_order
 293
 294     creators = opfDoc.css('dc|creator', 'dc' => @@DC_NS_URL)
 295     if (creators.length > 0)
 296       creator = creators[0]
 297       if nil != creator
 298         role = creator['opf:role']
 299         if 'aut' == role
 300           reading_order = creator.content
 301
 302           file_as = creator['opf:file-as']
 303           if nil != file_as
 304             sort_order = file_as
 305           end
 306         end
 307
 308         @author = Author.new(grouping, reading_order, sort_order)
 309       end
 310     end
 311
 312     #---------------------------------------
 313     # Title
 314
 315     titles = opfDoc.css('dc|title', 'dc' => @@DC_NS_URL)
 316     if titles.length > 0
 317       title = titles[0]
 318       if nil != title
 319         @title = title.content
 320       end
 321     end
 322
 323     #---------------------------------------
 324     # Description
 325
 326     descrNodes = opfDoc.css('dc|description', 'dc' => @@DC_NS_URL)
 327     if (descrNodes.length > 0)
 328       descrNode = descrNodes[0]
 329       if nil != descrNode
 330         @description = descrNode.content
 331       end
 332     end
 333
 334     #---------------------------------------
 335     # Language
 336
 337     langNodes = opfDoc.css('dc|language', 'dc' => @@DC_NS_URL)
 338     if (langNodes.length > 0)
 339       langNode = langNodes[0]
 340       if langNode
 341         @language = langNode.content
 342       end
 343     end
 344
 345     #---------------------------------------
 346     # Other metadata:  series, volume, cover
 347
 348     metas = opfDoc.css('package metadata meta')
 349     for m in metas
 350       name = m['name']
 351       content = m['content']
 352
 353       if 'calibre:series' == name
 354         # TODO:  Dynamically create a new series?
 355         # @series_id = content
 356       elsif 'calibre:series-index' == name
 357         @volume = content
 358       elsif 'cover' == name
 359         coverId = content
 360         #puts 'File ' + @path + ' coverId ' + coverId
 361       end
 362     end
 363
 364     #---------------
 365     # Load the cover
 366
 367     @cover = load_cover(zipfile, opfPath, opfDoc, coverId)
 368   end
 369
 370   protected
 371   def load_cover(zipfile, opfPath, opfDoc, coverId)
 372     coverFile = nil
 373     if nil == coverId
 374       coverId = "cover-image"
 375     end
 376
 377     items = opfDoc.css('package manifest item')
 378     for i in items
 379       href = i['href']
 380       id = i['id']
 381       mimeType = i['media-type']
 382
 383       if coverId == id
 384         entry = zipfile.find_entry(href)
 385
 386         if nil == entry
 387           # Although the epub standard requires the path to be relative
 388           # to the base of the epub (zip), some books encountered in the
 389           # wild have been found to use a bath relative to the location
 390           # of the opf file.
 391           parts = opfPath.split('/')
 392           opfBasePath = opfPath.split('/')[0..-2].join('/')
 393           coverPath = opfBasePath + '/' + href
 394           entry = zipfile.find_entry(coverPath)
 395         end
 396
 397         unless entry
 398           # Another case found in the wild:  cover image is at the root, but path is '../cover.jpeg'
 399           if href.start_with? '../'
 400             coverPath = href[3..-1]
 401             entry = zipfile.find_entry(coverPath)
 402           end
 403         end
 404
 405         if nil == entry
 406           puts 'WARNING!  Cover image "' + href + '" not found in file "' + @path + '".'
 407           return nil
 408         else
 409           entry.get_input_stream() do |is|
 410             return Cover.new(is, href, mimeType)
 411           end
 412         end
 413       end
 414     end
 415     return nil
 416   end
 417 end
 418