book.rb

   1
   2 require 'nokogiri'
   3 require 'rubygems'
   4 require 'zip'
   5
   6 require_relative 'author'
   7 require_relative 'classification'
   8 require_relative 'cover'
   9 require_relative 'store'
  10
  11 class Book
  12   @@DC_NS_URL = 'http://purl.org/dc/elements/1.1/'
  13   @@SERIES_AND_VOLUME_REGEX = /^([A-Z]+)([0-9]+(\.[0-9]+)?)$/
  14
  15   attr_accessor :arrived
  16   attr_accessor :author
  17   attr_accessor :classification_id
  18   attr_accessor :cover
  19   attr_accessor :description
  20   attr_accessor :language
  21   attr_accessor :path
  22   attr_accessor :series_id
  23   attr_accessor :title
  24   attr_accessor :volume
  25
  26   def initialize(store)
  27     @store = store
  28   end
  29
  30   def load_from_file!(fileName)
  31     @path = fileName
  32     parse_file_name!(fileName)
  33   end
  34
  35   def self.can_handle?(fileName)
  36     if nil == fileName
  37       return false
  38     end
  39
  40     #puts "Filename:  " + fileName.to_s
  41     lowerName = fileName.downcase()
  42
  43     if lowerName.end_with?(".epub")
  44       return true
  45     end
  46
  47     if lowerName.end_with?(".pdf")
  48       return true
  49     end
  50
  51     return false
  52   end
  53
  54   def self.grouping_for_title(title)
  55     result = title
  56
  57     '\'",!#'.split('').each do |c|
  58       result = result.gsub(c, '-')
  59     end
  60     result = result.gsub(/: */, '--')
  61     result = result.gsub(' ', '_')
  62
  63     result
  64   end
  65
  66   def heading
  67     result = []
  68
  69     if nil != @title
  70       result.push('<b>' + @title + '</b>')
  71     else
  72       result.push('<i>(Unknown title)</i>')
  73     end
  74     if nil != @author
  75       result.push('<i>by ' + @author.reading_order + '</i>')
  76     end
  77
  78     seriesInfo = []
  79     series = @store.load_series(@series_id)
  80     if nil != series and nil != series.descr
  81       seriesInfo.push(series.descr.to_s)
  82     end
  83     if nil != @volume
  84       seriesInfo.push(@volume.to_s)
  85     end
  86     if seriesInfo.length > 0
  87       result.push(seriesInfo.join(' '))
  88     end
  89
  90     classification = nil
  91     if nil != @classification_id
  92       classification = @store.load_classification(@classification_id)
  93     end
  94     if nil != classification
  95       if nil != classification.ddc
  96         result.push('Dewey: ' + classification.ddc.to_s)
  97       end
  98       if nil != classification.lcc
  99         result.push('LCC: ' + classification.lcc.to_s)
 100       end
 101     end
 102
 103     return result.join('<br/>')
 104   end
 105
 106   def inspect
 107     data = []
 108     if nil != @author
 109       data.push('author="' + @author.inspect + '"')
 110     end
 111     if nil != @series_id
 112       data.push('series_id="' + @series_id.to_s() + '"')
 113     end
 114     if nil != @volume
 115       data.push('volume="' + @volume + '"')
 116     end
 117     if nil != @title
 118       data.push('title="' + @title + '"')
 119     end
 120     if nil != @cover
 121       data.push(@cover.inspect())
 122     end
 123     if nil != @path
 124       data.push('path="' + @path + '"')
 125     end
 126     return '(Book:' + data.join(',') + ')'
 127   end
 128
 129   def to_s
 130     return inspect()
 131   end
 132
 133   def title_grouping
 134     if nil == @path
 135       return nil
 136     end
 137
 138     return File.basename(@path, '.*')
 139   end
 140
 141   protected
 142   def isUpper?(c)
 143     return /[[:upper:]]/.match(c)
 144   end
 145
 146   protected
 147   def massage_author(input)
 148     if nil == input
 149       return nil
 150     end
 151
 152     reading_order = ""
 153     input.each_char do |c|
 154       if isUpper?(c) and (reading_order.length > 0)
 155         reading_order += " "
 156      end
 157       reading_order += c
 158     end
 159
 160     return reading_order
 161   end
 162
 163   # Returns (series, volumeNo, titleText)
 164   protected
 165   def processTitle(input)
 166     if nil == input
 167       return nil
 168     end
 169
 170     arr = input.split('_')
 171
 172     series = nil
 173     vol = nil
 174
 175     first = arr[0]
 176     matchData = (arr[0]).match(@@SERIES_AND_VOLUME_REGEX)
 177     if nil != matchData
 178       capt = matchData.captures
 179       series = capt[0]
 180       vol = capt[1]
 181       arr.shift
 182     end
 183
 184     pos = arr[-1].rindex('.')
 185     if nil != pos
 186       arr[-1] = arr[-1].slice(0, pos)
 187     end
 188
 189     title = arr.join(' ')
 190
 191     bare_title_grouping = title_grouping
 192       .split('_')
 193       .reject { |part| part.match(@@SERIES_AND_VOLUME_REGEX) }
 194       .join('_')
 195
 196     unless bare_title_grouping == Book.grouping_for_title(title)
 197       puts "WARNING:  title_grouping mismatch:  #{bare_title_grouping.inspect} vs. #{Book.grouping_for_title(title).inspect}"
 198     end
 199
 200     return series, vol, title
 201   end
 202
 203   protected
 204   def parse_file_name!(file_name)
 205     category = nil   # e.g., non-fiction, fan-fiction
 206     grouping = ''
 207
 208     parts = file_name.split('/')
 209     (series_code, @volume, @title) = processTitle(parts[-1])
 210     if parts.length > 1
 211       grouping = parts[-2]
 212       reading_order = massage_author(grouping)
 213       sort_order = nil
 214       @author = Author.new(grouping, reading_order, sort_order)
 215       @series_id = @store.get_series(grouping, series_code)
 216     end
 217     if parts.length > 2
 218       category = parts[-3]
 219     end
 220
 221     lc_file_name = file_name.downcase
 222     if lc_file_name.end_with?(".epub")
 223       scanEpub!(file_name)
 224     elsif lc_file_name.end_with?(".pdf")
 225       scan_pdf!(file_name)
 226     end
 227
 228     @arrived = File.ctime(file_name)
 229
 230     @classification_id = @store.find_classification(@author.grouping, File.basename(file_name, '.*'))
 231
 232     # TODO:  Fix horrible hard-coded strings and paths
 233     if ('01_nonfic' == category) && (nil == classification_id)
 234       open(Store.unclassified_csv, 'a') do |fd|
 235         fd.puts('"' + grouping.to_s + '","' + path + '"')
 236       end
 237     end
 238   end
 239
 240   protected
 241   def scanEpub!(fileName)
 242     #puts 'Scanning "' + fileName.to_s + '"...'
 243     begin
 244       Zip.warn_invalid_date = false
 245       Zip::File.open(fileName) do |zipfile|
 246         entry = zipfile.find_entry('META-INF/container.xml')
 247         if nil == entry
 248           puts 'No META-INF/container.xml, skipping book ' + fileName
 249           return
 250         end
 251         contXml = zipfile.read('META-INF/container.xml')
 252         contDoc = Nokogiri::XML(contXml)
 253         opfPath = contDoc.css("container rootfiles rootfile")[0]['full-path']
 254
 255         scanOpf!(zipfile, opfPath)
 256       end
 257     rescue Zip::Error => exc
 258       puts 'ERROR processing file "' + fileName + '":'
 259       puts exc.message
 260       puts exc.backtrace
 261     end
 262   end
 263
 264   protected
 265   def scan_pdf!(file_name)
 266     #puts 'Scanning "' + file_name.to_s + '"...'
 267
 268     pdf_path = File.expand_path(file_name).to_s
 269     if ! pdf_path.end_with?('.pdf')
 270       puts 'Unexpected internal error:  path "' + file_name.to_s + '" does not end with ".pdf".'
 271       return
 272     end
 273
 274     jpeg_path = pdf_path[0..-5] + '.jpeg'
 275     if File.file?(jpeg_path)
 276       File.open(jpeg_path, 'r') do |is|
 277         @cover = Cover.new(is, jpeg_path, 'image/jpeg')
 278       end
 279     end
 280   end
 281
 282
 283   protected
 284   def scanOpf!(zipfile, opfPath)
 285     coverId = nil
 286
 287     opfXml = zipfile.read(opfPath)
 288     opfDoc = Nokogiri::XML(opfXml)
 289
 290     #-------
 291     # Author
 292
 293     grouping = @author.grouping
 294     reading_order = @author.reading_order
 295     sort_order = @author.sort_order
 296
 297     creators = opfDoc.css('dc|creator', 'dc' => @@DC_NS_URL)
 298     if (creators.length > 0)
 299       creator = creators[0]
 300       if nil != creator
 301         role = creator['opf:role']
 302         if 'aut' == role
 303           reading_order = creator.content
 304
 305           file_as = creator['opf:file-as']
 306           if nil != file_as
 307             sort_order = file_as
 308           end
 309         end
 310
 311         @author = Author.new(grouping, reading_order, sort_order)
 312       end
 313     end
 314
 315     #---------------------------------------
 316     # Title
 317
 318     titles = opfDoc.css('dc|title', 'dc' => @@DC_NS_URL)
 319     if titles.length > 0
 320       title = titles[0]
 321       if nil != title
 322         @title = title.content
 323       end
 324     end
 325
 326     #---------------------------------------
 327     # Description
 328
 329     descrNodes = opfDoc.css('dc|description', 'dc' => @@DC_NS_URL)
 330     if (descrNodes.length > 0)
 331       descrNode = descrNodes[0]
 332       if nil != descrNode
 333         @description = descrNode.content
 334       end
 335     end
 336
 337     #---------------------------------------
 338     # Language
 339
 340     langNodes = opfDoc.css('dc|language', 'dc' => @@DC_NS_URL)
 341     if (langNodes.length > 0)
 342       langNode = langNodes[0]
 343       if langNode
 344         @language = langNode.content
 345       end
 346     end
 347
 348     #---------------------------------------
 349     # Other metadata:  series, volume, cover
 350
 351     metas = opfDoc.css('package metadata meta')
 352     for m in metas
 353       name = m['name']
 354       content = m['content']
 355
 356       if 'calibre:series' == name
 357         # TODO:  Dynamically create a new series?
 358         # @series_id = content
 359       elsif 'calibre:series-index' == name
 360         @volume = content
 361       elsif 'cover' == name
 362         coverId = content
 363         #puts 'File ' + @path + ' coverId ' + coverId
 364       end
 365     end
 366
 367     #---------------
 368     # Load the cover
 369
 370     @cover = load_cover(zipfile, opfPath, opfDoc, coverId)
 371   end
 372
 373   protected
 374   def load_cover(zipfile, opfPath, opfDoc, coverId)
 375     coverFile = nil
 376     if nil == coverId
 377       coverId = "cover-image"
 378     end
 379
 380     items = opfDoc.css('package manifest item')
 381     for i in items
 382       href = i['href']
 383       id = i['id']
 384       mimeType = i['media-type']
 385
 386       if coverId == id
 387         entry = zipfile.find_entry(href)
 388
 389         if nil == entry
 390           # Although the epub standard requires the path to be relative
 391           # to the base of the epub (zip), some books encountered in the
 392           # wild have been found to use a bath relative to the location
 393           # of the opf file.
 394           parts = opfPath.split('/')
 395           opfBasePath = opfPath.split('/')[0..-2].join('/')
 396           coverPath = opfBasePath + '/' + href
 397           entry = zipfile.find_entry(coverPath)
 398         end
 399
 400         unless entry
 401           # Another case found in the wild:  cover image is at the root, but path is '../cover.jpeg'
 402           if href.start_with? '../'
 403             coverPath = href[3..-1]
 404             entry = zipfile.find_entry(coverPath)
 405           end
 406         end
 407
 408         if nil == entry
 409           puts 'WARNING!  Cover image "' + href + '" not found in file "' + @path + '".'
 410           return nil
 411         else
 412           entry.get_input_stream() do |is|
 413             return Cover.new(is, href, mimeType)
 414           end
 415         end
 416       end
 417     end
 418     return nil
 419   end
 420 end
 421