book.rb

   1
   2 require 'nokogiri'
   3 require 'rubygems'
   4 require 'zip'
   5
   6 require_relative 'author'
   7 require_relative 'classification'
   8 require_relative 'cover'
   9 require_relative 'store'
  10
  # Catalogue record for a single e-book file (".epub" or ".pdf").
  #
  # A Book is filled in by #load_from_file!:
  # * series code, volume and title are parsed from the file name;
  # * the author grouping is the name of the parent directory;
  # * for EPUB files, author, title, description, volume and cover are then
  #   read from the package (OPF) metadata inside the archive;
  # * for PDF files, the cover is taken from a ".jpeg" file stored next to
  #   the PDF.
  #
  # The store handed to the constructor is used for every series and
  # classification lookup; this class calls get_series, find_classification,
  # load_series and load_classification on it.
  class Book
    # XML namespace URI of the Dublin Core elements (dc:creator, dc:title,
    # dc:description) queried in the OPF document.
    DC_NS_URL = 'http://purl.org/dc/elements/1.1/'.freeze

    attr_accessor :author, :classification_id, :cover, :description,
                  :path, :series_id, :title, :volume

    # store: the object used for series and classification lookups.
    def initialize(store)
      @store = store
      @author = nil
      @classification_id = nil
      @cover = nil
      @description = nil
      @path = nil
      @series_id = nil
      @title = nil
      @volume = nil
    end

    # Remembers fileName as this book's path and populates the remaining
    # attributes from the file name and, where supported, the file contents.
    def load_from_file!(fileName)
      @path = fileName
      parse_file_name!(fileName)
    end

    # Returns true when fileName ends with ".epub" or ".pdf" (compared
    # case-insensitively), false otherwise, including for nil.
    def self.can_handle?(fileName)
      return false if fileName.nil?

      fileName.to_s.downcase.end_with?('.epub', '.pdf')
    end

    # Builds a short HTML summary: title, author, series/volume and
    # classification codes, one per line, separated by "<br/>".
    #
    # NOTE(review): title and author text are inserted as-is, without HTML
    # escaping -- confirm whether the consumer of this string needs it.
    def heading
      lines = []
      lines.push(@title.nil? ? '<i>(Unknown title)</i>' : "<b>#{@title}</b>")
      lines.push("<i>by #{@author.reading_order}</i>") unless @author.nil?

      # Only ask the store for a series when there is an id to look up,
      # mirroring the classification lookup below.
      series = @series_id.nil? ? nil : @store.load_series(@series_id)
      series_info = []
      series_info.push(series.descr.to_s) unless series.nil? || series.descr.nil?
      series_info.push(@volume.to_s) unless @volume.nil?
      lines.push(series_info.join(' ')) unless series_info.empty?

      classification = nil
      classification = @store.load_classification(@classification_id) unless @classification_id.nil?
      unless classification.nil?
        lines.push("Dewey: #{classification.ddc}") unless classification.ddc.nil?
        lines.push("LCC: #{classification.lcc}") unless classification.lcc.nil?
      end

      lines.join('<br/>')
    end

    # Debug representation listing every attribute that is set, e.g.
    #   (Book:volume="3",title="T")
    # Interpolation is used so that non-String values (an Integer volume,
    # a Pathname path) do not raise.
    def inspect
      fields = []
      fields.push("author=\"#{@author.inspect}\"") unless @author.nil?
      fields.push("series_id=\"#{@series_id}\"") unless @series_id.nil?
      fields.push("volume=\"#{@volume}\"") unless @volume.nil?
      fields.push("title=\"#{@title}\"") unless @title.nil?
      fields.push(@cover.inspect) unless @cover.nil?
      fields.push("path=\"#{@path}\"") unless @path.nil?
      "(Book:#{fields.join(',')})"
    end

    # Same text as #inspect.
    def to_s
      inspect
    end

    # Returns the file name of #path without directory and extension, or
    # nil when no path is set.
    def title_grouping
      return nil if @path.nil?

      File.basename(@path, '.*')
    end

    protected

    # Returns true when c contains an upper-case character, false otherwise
    # (also for nil).
    def isUpper?(c)
      !(/[[:upper:]]/ =~ c).nil?
    end

    # Turns a run-together directory name into reading order by inserting a
    # space before every upper-case character except a leading one, e.g.
    # "JaneDoe" becomes "Jane Doe".  Returns nil for nil input.
    def massage_author(input)
      return nil if input.nil?

      # dup: never append to the string literal itself.
      reading_order = ''.dup
      input.each_char do |c|
        reading_order << ' ' if isUpper?(c) && !reading_order.empty?
        reading_order << c
      end
      reading_order
    end

    # Splits a file name of the form "<SERIES><VOL>_Word_Word.ext".
    #
    # Returns (series, volumeNo, titleText).  series and volumeNo are nil
    # when the first "_"-separated word is not upper-case letters followed
    # by digits.  The title is the remaining words joined with spaces, with
    # the extension removed from the last word.  Returns nil for nil input.
    def processTitle(input)
      return nil if input.nil?

      words = input.split('_')
      series = nil
      vol = nil

      # to_s covers the empty file name, for which split yields no words.
      match = /\A([A-Z]+)([0-9]+)\z/.match(words.first.to_s)
      unless match.nil?
        series, vol = match.captures
        words.shift
      end

      # A name that was only "<SERIES><VOL>_" leaves no words to trim.
      unless words.empty?
        dot = words[-1].rindex('.')
        words[-1] = words[-1].slice(0, dot) unless dot.nil?
      end

      return series, vol, words.join(' ')
    end

    # Populates the book from a path of the form
    # ".../<category>/<author grouping>/<file name>", scans the file itself
    # when it is an EPUB or PDF, and looks up the classification.
    #
    # A non-fiction book without a classification is appended to the file
    # named by Store.unclassified_csv.
    def parse_file_name!(file_name)
      category = nil   # e.g., non-fiction, fan-fiction
      grouping = ''

      parts = file_name.split('/')
      series_code, @volume, @title = processTitle(parts[-1])
      if parts.length > 1
        grouping = parts[-2]
        @author = Author.new(grouping, massage_author(grouping), nil)
        @series_id = @store.get_series(grouping, series_code)
      end
      category = parts[-3] if parts.length > 2

      lc_file_name = file_name.downcase
      if lc_file_name.end_with?('.epub')
        scanEpub!(file_name)
      elsif lc_file_name.end_with?('.pdf')
        scan_pdf!(file_name)
      end

      # A bare file name has no parent directory and therefore no author;
      # fall back to the (empty) grouping instead of calling into nil.
      author_grouping = @author.nil? ? grouping : @author.grouping
      @classification_id = @store.find_classification(author_grouping, File.basename(file_name, '.*'))

      # TODO:  Fix horrible hard-coded strings and paths
      if category == '01_nonfic' && classification_id.nil?
        # File.open, not Kernel#open: the target is always a plain file.
        File.open(Store.unclassified_csv, 'a') do |fd|
          fd.puts([grouping, path || file_name].map { |field| csv_quote(field) }.join(','))
        end
      end
    end

    # Wraps value in double quotes for a CSV row, doubling any embedded
    # double quote so the row stays parseable.
    def csv_quote(value)
      '"' + value.to_s.gsub('"', '""') + '"'
    end

    # Opens fileName as a zip archive, locates the package (OPF) document
    # through META-INF/container.xml and hands it to scanOpf!.
    #
    # Archives without a usable container.xml are skipped with a message;
    # Zip::Error is reported on standard output and not re-raised.
    def scanEpub!(fileName)
      #puts 'Scanning "' + fileName.to_s + '"...'
      Zip.warn_invalid_date = false
      Zip::File.open(fileName) do |zipfile|
        if zipfile.find_entry('META-INF/container.xml').nil?
          puts "No META-INF/container.xml, skipping book #{fileName}"
          return
        end

        container = Nokogiri::XML(zipfile.read('META-INF/container.xml'))
        rootfile = container.css('container rootfiles rootfile')[0]
        opf_path = rootfile.nil? ? nil : rootfile['full-path']
        if opf_path.nil?
          puts "No rootfile in META-INF/container.xml, skipping book #{fileName}"
          return
        end

        scanOpf!(zipfile, opf_path)
      end
    rescue Zip::Error => exc
      puts "ERROR processing file \"#{fileName}\":"
      puts exc.message
      puts exc.backtrace
    end

    # Uses "<name>.jpeg" next to "<name>.pdf" as the cover, when that file
    # exists.  The ".pdf" extension is matched case-insensitively, like the
    # checks that route a file to this method.
    def scan_pdf!(file_name)
      #puts 'Scanning "' + file_name.to_s + '"...'

      pdf_path = File.expand_path(file_name).to_s
      unless pdf_path.downcase.end_with?('.pdf')
        puts "Unexpected internal error:  path \"#{file_name}\" does not end with \".pdf\"."
        return
      end

      jpeg_path = pdf_path[0..-5] + '.jpeg'
      return unless File.file?(jpeg_path)

      # Binary mode: the file holds image data, not text.
      File.open(jpeg_path, 'rb') do |is|
        @cover = Cover.new(is, jpeg_path, 'image/jpeg')
      end
    end

    # Reads author, title, description, volume and cover from the package
    # (OPF) document stored at opfPath inside zipfile.  Attributes for which
    # the document has no element keep their current value, except the
    # cover, which is always replaced by the result of load_cover.
    def scanOpf!(zipfile, opfPath)
      # Do not ask the archive to read an entry it does not contain.
      if zipfile.find_entry(opfPath).nil?
        puts "WARNING!  Package file \"#{opfPath}\" not found in file \"#{@path}\"."
        return
      end
      opf_doc = Nokogiri::XML(zipfile.read(opfPath))

      #-------
      # Author

      creator = opf_doc.css('dc|creator', 'dc' => DC_NS_URL)[0]
      unless creator.nil?
        # @author is nil when the book path had no parent directory.
        grouping = @author.nil? ? nil : @author.grouping
        reading_order = @author.nil? ? nil : @author.reading_order
        sort_order = @author.nil? ? nil : @author.sort_order

        # Only a creator explicitly marked as author overrides the names
        # derived from the directory.
        if creator['opf:role'] == 'aut'
          reading_order = creator.content
          file_as = creator['opf:file-as']
          sort_order = file_as unless file_as.nil?
        end

        @author = Author.new(grouping, reading_order, sort_order)
      end

      #---------------------------------------
      # Title

      title_node = opf_doc.css('dc|title', 'dc' => DC_NS_URL)[0]
      @title = title_node.content unless title_node.nil?

      #---------------------------------------
      # Description

      descr_node = opf_doc.css('dc|description', 'dc' => DC_NS_URL)[0]
      @description = descr_node.content unless descr_node.nil?

      #---------------------------------------
      # Other metadata:  series, volume, cover

      cover_id = nil
      opf_doc.css('package metadata meta').each do |meta|
        case meta['name']
        when 'calibre:series'
          # TODO:  Dynamically create a new series?
          # @series_id = meta['content']
        when 'calibre:series-index'
          @volume = meta['content']
        when 'cover'
          cover_id = meta['content']
        end
      end

      #---------------
      # Load the cover

      @cover = load_cover(zipfile, opfPath, opf_doc, cover_id)
    end

    # Returns a Cover for the manifest item whose id equals coverId
    # ("cover-image" when coverId is nil), or nil when the manifest has no
    # such item or its image cannot be found in the archive.
    def load_cover(zipfile, opfPath, opfDoc, coverId)
      coverId = 'cover-image' if coverId.nil?

      item = opfDoc.css('package manifest item').find { |i| i['id'] == coverId }
      return nil if item.nil?

      href = item['href']
      entry = find_cover_entry(zipfile, opfPath, href)
      if entry.nil?
        puts "WARNING!  Cover image \"#{href}\" not found in file \"#{@path}\"."
        return nil
      end

      entry.get_input_stream do |is|
        return Cover.new(is, href, item['media-type'])
      end
    end

    # Returns the archive entry for the cover image referenced by href, or
    # nil.  The href is tried as written, then relative to the directory of
    # the OPF file, then with a leading "../" removed.
    def find_cover_entry(zipfile, opfPath, href)
      return nil if href.nil?

      candidates = [href]

      # Although the epub standard requires the path to be relative
      # to the base of the epub (zip), some books encountered in the
      # wild have been found to use a path relative to the location
      # of the opf file.
      opf_dir = opfPath.split('/')[0..-2].join('/')
      candidates.push("#{opf_dir}/#{href}") unless opf_dir.empty?

      # Another case found in the wild:  cover image is at the root, but path is '../cover.jpeg'
      candidates.push(href[3..-1]) if href.start_with?('../')

      candidates.each do |candidate|
        entry = zipfile.find_entry(candidate)
        return entry unless entry.nil?
      end
      nil
    end
  end
 447