book.rb

   1
   2 require 'nokogiri'
   3 require 'zip'
   4
   5 require 'author'
   6 require 'classification'
   7 require 'cover'
   8 require 'store'
   9
  10 class Book
  11   @@DC_NS_URL = 'http://purl.org/dc/elements/1.1/'
  12
  13   def initialize(store)
  14     @author = nil
  15     @classification_id = nil
  16     @cover = nil
  17     @description = nil
  18     @path = nil
  19     @series_id = nil
  20     @store = store
  21     @title = nil
  22     @volume = nil
  23   end
  24
  25   def load_from_file!(fileName)
  26     @path = fileName
  27     parse_file_name!(fileName)
  28   end
  29
  30   def self.can_handle?(fileName)
  31     if nil == fileName
  32       return false
  33     end
  34
  35     #puts "Filename:  " + fileName.to_s
  36     lowerName = fileName.downcase()
  37
  38     if lowerName.end_with?(".epub")
  39       return true
  40     end
  41
  42     if lowerName.end_with?(".pdf")
  43       return true
  44     end
  45
  46     return false
  47   end
  48
  49   def author
  50     return @author
  51   end
  52
  53   def author=(value)
  54     @author = value
  55   end
  56
  57   def classification_id
  58     @classification_id
  59   end
  60
  61   def classification_id=(value)
  62     @classification_id = value
  63   end
  64
  65   def cover
  66     return @cover
  67   end
  68
  69   def cover=(value)
  70     @cover = value
  71   end
  72
  73   def description
  74     @description
  75   end
  76
  77   def description=(value)
  78     @description = value
  79   end
  80
  81   def heading
  82     result = []
  83
  84     if nil != @title
  85       result.push('<b>' + @title + '</b>')
  86     else
  87       result.push('<i>(Unknown title)</i>')
  88     end
  89     if nil != @author
  90       result.push('<i>by ' + @author.reading_order + '</i>')
  91     end
  92
  93     seriesInfo = []
  94     series = @store.load_series(@series_id)
  95     if nil != series and nil != series.descr
  96       seriesInfo.push(series.descr.to_s)
  97     end
  98     if nil != @volume
  99       seriesInfo.push(@volume.to_s)
 100     end
 101     if seriesInfo.length > 0
 102       result.push(seriesInfo.join(' '))
 103     end
 104
 105     classification = nil
 106     if nil != @classification_id
 107       classification = @store.load_classification(@classification_id)
 108     end
 109     if nil != classification
 110       if nil != classification.ddc
 111         result.push('Dewey: ' + classification.ddc.to_s)
 112       end
 113       if nil != classification.lcc
 114         result.push('LCC: ' + classification.lcc.to_s)
 115       end
 116     end
 117
 118     return result.join('<br/>')
 119   end
 120
 121   def inspect
 122     data = []
 123     if nil != @author
 124       data.push('author="' + @author.inspect + '"')
 125     end
 126     if nil != @series_id
 127       data.push('series_id="' + @series_id.to_s() + '"')
 128     end
 129     if nil != @volume
 130       data.push('volume="' + @volume + '"')
 131     end
 132     if nil != @title
 133       data.push('title="' + @title + '"')
 134     end
 135     if nil != @cover
 136       data.push(@cover.inspect())
 137     end
 138     if nil != @path
 139       data.push('path="' + @path + '"')
 140     end
 141     return '(Book:' + data.join(',') + ')'
 142   end
 143
 144   def path
 145     @path
 146   end
 147
 148   def path=(value)
 149     @path = value
 150   end
 151
 152   def series_id
 153     @series_id
 154   end
 155
 156   def series_id=(value)
 157     @series_id = value
 158   end
 159
 160   def to_s
 161     return inspect()
 162   end
 163
 164   def title
 165     @title
 166   end
 167
 168   def title=(value)
 169     @title = value
 170   end
 171
 172   def title_grouping
 173     if nil == @path
 174       return nil
 175     end
 176
 177     return File.basename(@path, '.*')
 178   end
 179
 180   def volume
 181     @volume
 182   end
 183
 184   def volume=(value)
 185     @volume = value
 186   end
 187
 188   protected
 189   def isUpper?(c)
 190     return /[[:upper:]]/.match(c)
 191   end
 192
 193   protected
 194   def massage_author(input)
 195     if nil == input
 196       return nil
 197     end
 198
 199     reading_order = ""
 200     input.each_char do |c|
 201       if isUpper?(c) and (reading_order.length > 0)
 202         reading_order += " "
 203      end
 204       reading_order += c
 205     end
 206
 207     return reading_order
 208   end
 209
 210   # Returns (series, volumeNo, titleText)
 211   protected
 212   def processTitle(input)
 213     if nil == input
 214       return nil
 215     end
 216
 217     arr = input.split('_')
 218
 219     series = nil
 220     vol = nil
 221
 222     first = arr[0]
 223     matchData = (arr[0]).match(/^([A-Z]+)([0-9]+)$/)
 224     if nil != matchData
 225       capt = matchData.captures
 226       series = capt[0]
 227       vol = capt[1]
 228       arr.shift
 229     end
 230
 231     pos = arr[-1].rindex('.')
 232     if nil != pos
 233       arr[-1] = arr[-1].slice(0, pos)
 234     end
 235
 236     title = arr.join(' ')
 237
 238     return series, vol, title
 239   end
 240
 241   protected
 242   def parse_file_name!(file_name)
 243     category = nil   # e.g., non-fiction, fan-fiction
 244     grouping = ''
 245
 246     parts = file_name.split('/')
 247     (series_code, @volume, @title) = processTitle(parts[-1])
 248     if parts.length > 1
 249       grouping = parts[-2]
 250       reading_order = massage_author(grouping)
 251       sort_order = nil
 252       @author = Author.new(grouping, reading_order, sort_order)
 253       @series_id = @store.get_series(grouping, series_code)
 254     end
 255     if parts.length > 2
 256       category = parts[-3]
 257     end
 258
 259     lc_file_name = file_name.downcase
 260     if lc_file_name.end_with?(".epub")
 261       scanEpub!(file_name)
 262     elsif lc_file_name.end_with?(".pdf")
 263       scan_pdf!(file_name)
 264     end
 265
 266     @classification_id = @store.find_classification(@author.grouping, File.basename(file_name, '.*'))
 267
 268     # TODO:  Fix horrible hard-coded strings and paths
 269     if ('01_nonfic' == category) && (nil == classification_id)
 270       open(Store.unclassified_csv, 'a') do |fd|
 271         fd.puts('"' + grouping.to_s + '","' + path + '"')
 272       end
 273     end
 274   end
 275
 276   protected
 277   def scanEpub!(fileName)
 278     #puts 'Scanning "' + fileName.to_s + '"...'
 279     begin
 280       Zip::File.open(fileName) do |zipfile|
 281         entry = zipfile.find_entry('META-INF/container.xml')
 282         if nil == entry
 283           puts 'No META-INF/container.xml, skipping book ' + fileName
 284           return
 285         end
 286         contXml = zipfile.read('META-INF/container.xml')
 287         contDoc = Nokogiri::XML(contXml)
 288         opfPath = contDoc.css("container rootfiles rootfile")[0]['full-path']
 289
 290         scanOpf!(zipfile, opfPath)
 291       end
 292     rescue Zip::Error => exc
 293       puts 'ERROR processing file "' + fileName + '":'
 294       puts exc.message
 295       puts exc.backtrace
 296     end
 297   end
 298
 299   protected
 300   def scan_pdf!(file_name)
 301     #puts 'Scanning "' + file_name.to_s + '"...'
 302
 303     pdf_path = File.expand_path(file_name).to_s
 304     if ! pdf_path.end_with?('.pdf')
 305       puts 'Unexpected internal error:  path "' + file_name.to_s + '" does not end with ".pdf".'
 306       return
 307     end
 308
 309     jpeg_path = pdf_path[0..-5] + '.jpeg'
 310     if File.file?(jpeg_path)
 311       File.open(jpeg_path, 'r') do |is|
 312         @cover = Cover.new(is, jpeg_path, 'image/jpeg')
 313       end
 314     end
 315   end
 316
 317
 318   protected
 319   def scanOpf!(zipfile, opfPath)
 320     coverId = nil
 321
 322     opfXml = zipfile.read(opfPath)
 323     opfDoc = Nokogiri::XML(opfXml)
 324
 325     #-------
 326     # Author
 327
 328     grouping = @author.grouping
 329     reading_order = @author.reading_order
 330     sort_order = @author.sort_order
 331
 332     creators = opfDoc.css('dc|creator', 'dc' => @@DC_NS_URL)
 333     if (creators.length > 0)
 334       creator = creators[0]
 335       if nil != creator
 336         role = creator['opf:role']
 337         if 'aut' == role
 338           reading_order = creator.content
 339
 340           file_as = creator['opf:file-as']
 341           if nil != file_as
 342             sort_order = file_as
 343           end
 344         end
 345
 346         @author = Author.new(grouping, reading_order, sort_order)
 347       end
 348     end
 349
 350     #---------------------------------------
 351     # Title
 352
 353     titles = opfDoc.css('dc|title', 'dc' => @@DC_NS_URL)
 354     if titles.length > 0
 355       title = titles[0]
 356       if nil != title
 357         @title = title.content
 358       end
 359     end
 360
 361     #---------------------------------------
 362     # Description
 363
 364     descrNodes = opfDoc.css('dc|description', 'dc' => @@DC_NS_URL)
 365     if (descrNodes.length > 0)
 366       descrNode = descrNodes[0]
 367       if nil != descrNode
 368         @description = descrNode.content
 369       end
 370     end
 371
 372     #---------------------------------------
 373     # Other metadata:  series, volume, cover
 374
 375     metas = opfDoc.css('package metadata meta')
 376     for m in metas
 377       name = m['name']
 378       content = m['content']
 379
 380       if 'calibre:series' == name
 381         # TODO:  Dynamically create a new series?
 382         # @series_id = content
 383       elsif 'calibre:series-index' == name
 384         @volume = content
 385       elsif 'cover' == name
 386         coverId = content
 387         #puts 'File ' + @path + ' coverId ' + coverId
 388       end
 389     end
 390
 391     #---------------
 392     # Load the cover
 393
 394     @cover = load_cover(zipfile, opfPath, opfDoc, coverId)
 395   end
 396
 397   protected
 398   def load_cover(zipfile, opfPath, opfDoc, coverId)
 399     coverFile = nil
 400     if nil == coverId
 401       coverId = "cover-image"
 402     end
 403
 404     items = opfDoc.css('package manifest item')
 405     for i in items
 406       href = i['href']
 407       id = i['id']
 408       mimeType = i['media-type']
 409
 410       if coverId == id
 411         entry = zipfile.find_entry(href)
 412
 413         if nil == entry
 414           # Although the epub standard requires the path to be relative
 415           # to the base of the epub (zip), some books encountered in the
 416           # wild have been found to use a bath relative to the location
 417           # of the opf file.
 418           parts = opfPath.split('/')
 419           opfBasePath = opfPath.split('/')[0..-2].join('/')
 420           coverPath = opfBasePath + '/' + href
 421           entry = zipfile.find_entry(coverPath)
 422         end
 423
 424         if nil == entry
 425           puts 'WARNING!  Cover image "' + href + '" not found in file "' + @path + '".'
 426           return nil
 427         else
 428           entry.get_input_stream() do |is|
 429             return Cover.new(is, href, mimeType)
 430           end
 431         end
 432       end
 433     end
 434     return nil
 435   end
 436 end
 437