book.rb

   1
require 'rubygems'
require 'cgi/escape'

require 'nokogiri'
require 'zip'

require_relative 'author'
require_relative 'classification'
require_relative 'cover'
require_relative 'store'
  10
  11 class Book
  12   @@DC_NS_URL = 'http://purl.org/dc/elements/1.1/'
  13
  14   attr_accessor :author
  15   attr_accessor :classification_id
  16   attr_accessor :cover
  17   attr_accessor :description
  18   attr_accessor :path
  19   attr_accessor :series_id
  20   attr_accessor :title
  21   attr_accessor :volume
  22
  23   def initialize(store)
  24     @store = store
  25   end
  26
  27   def load_from_file!(fileName)
  28     @path = fileName
  29     parse_file_name!(fileName)
  30   end
  31
  32   def self.can_handle?(fileName)
  33     if nil == fileName
  34       return false
  35     end
  36
  37     #puts "Filename:  " + fileName.to_s
  38     lowerName = fileName.downcase()
  39
  40     if lowerName.end_with?(".epub")
  41       return true
  42     end
  43
  44     if lowerName.end_with?(".pdf")
  45       return true
  46     end
  47
  48     return false
  49   end
  50
  51   def heading
  52     result = []
  53
  54     if nil != @title
  55       result.push('<b>' + @title + '</b>')
  56     else
  57       result.push('<i>(Unknown title)</i>')
  58     end
  59     if nil != @author
  60       result.push('<i>by ' + @author.reading_order + '</i>')
  61     end
  62
  63     seriesInfo = []
  64     series = @store.load_series(@series_id)
  65     if nil != series and nil != series.descr
  66       seriesInfo.push(series.descr.to_s)
  67     end
  68     if nil != @volume
  69       seriesInfo.push(@volume.to_s)
  70     end
  71     if seriesInfo.length > 0
  72       result.push(seriesInfo.join(' '))
  73     end
  74
  75     classification = nil
  76     if nil != @classification_id
  77       classification = @store.load_classification(@classification_id)
  78     end
  79     if nil != classification
  80       if nil != classification.ddc
  81         result.push('Dewey: ' + classification.ddc.to_s)
  82       end
  83       if nil != classification.lcc
  84         result.push('LCC: ' + classification.lcc.to_s)
  85       end
  86     end
  87
  88     return result.join('<br/>')
  89   end
  90
  91   def inspect
  92     data = []
  93     if nil != @author
  94       data.push('author="' + @author.inspect + '"')
  95     end
  96     if nil != @series_id
  97       data.push('series_id="' + @series_id.to_s() + '"')
  98     end
  99     if nil != @volume
 100       data.push('volume="' + @volume + '"')
 101     end
 102     if nil != @title
 103       data.push('title="' + @title + '"')
 104     end
 105     if nil != @cover
 106       data.push(@cover.inspect())
 107     end
 108     if nil != @path
 109       data.push('path="' + @path + '"')
 110     end
 111     return '(Book:' + data.join(',') + ')'
 112   end
 113
 114   def to_s
 115     return inspect()
 116   end
 117
 118   def title_grouping
 119     if nil == @path
 120       return nil
 121     end
 122
 123     return File.basename(@path, '.*')
 124   end
 125
 126   protected
 127   def isUpper?(c)
 128     return /[[:upper:]]/.match(c)
 129   end
 130
 131   protected
 132   def massage_author(input)
 133     if nil == input
 134       return nil
 135     end
 136
 137     reading_order = ""
 138     input.each_char do |c|
 139       if isUpper?(c) and (reading_order.length > 0)
 140         reading_order += " "
 141      end
 142       reading_order += c
 143     end
 144
 145     return reading_order
 146   end
 147
 148   # Returns (series, volumeNo, titleText)
 149   protected
 150   def processTitle(input)
 151     if nil == input
 152       return nil
 153     end
 154
 155     arr = input.split('_')
 156
 157     series = nil
 158     vol = nil
 159
 160     first = arr[0]
 161     matchData = (arr[0]).match(/^([A-Z]+)([0-9]+)$/)
 162     if nil != matchData
 163       capt = matchData.captures
 164       series = capt[0]
 165       vol = capt[1]
 166       arr.shift
 167     end
 168
 169     pos = arr[-1].rindex('.')
 170     if nil != pos
 171       arr[-1] = arr[-1].slice(0, pos)
 172     end
 173
 174     title = arr.join(' ')
 175
 176     return series, vol, title
 177   end
 178
 179   protected
 180   def parse_file_name!(file_name)
 181     category = nil   # e.g., non-fiction, fan-fiction
 182     grouping = ''
 183
 184     parts = file_name.split('/')
 185     (series_code, @volume, @title) = processTitle(parts[-1])
 186     if parts.length > 1
 187       grouping = parts[-2]
 188       reading_order = massage_author(grouping)
 189       sort_order = nil
 190       @author = Author.new(grouping, reading_order, sort_order)
 191       @series_id = @store.get_series(grouping, series_code)
 192     end
 193     if parts.length > 2
 194       category = parts[-3]
 195     end
 196
 197     lc_file_name = file_name.downcase
 198     if lc_file_name.end_with?(".epub")
 199       scanEpub!(file_name)
 200     elsif lc_file_name.end_with?(".pdf")
 201       scan_pdf!(file_name)
 202     end
 203
 204     @classification_id = @store.find_classification(@author.grouping, File.basename(file_name, '.*'))
 205
 206     # TODO:  Fix horrible hard-coded strings and paths
 207     if ('01_nonfic' == category) && (nil == classification_id)
 208       open(Store.unclassified_csv, 'a') do |fd|
 209         fd.puts('"' + grouping.to_s + '","' + path + '"')
 210       end
 211     end
 212   end
 213
 214   protected
 215   def scanEpub!(fileName)
 216     #puts 'Scanning "' + fileName.to_s + '"...'
 217     begin
 218       Zip.warn_invalid_date = false
 219       Zip::File.open(fileName) do |zipfile|
 220         entry = zipfile.find_entry('META-INF/container.xml')
 221         if nil == entry
 222           puts 'No META-INF/container.xml, skipping book ' + fileName
 223           return
 224         end
 225         contXml = zipfile.read('META-INF/container.xml')
 226         contDoc = Nokogiri::XML(contXml)
 227         opfPath = contDoc.css("container rootfiles rootfile")[0]['full-path']
 228
 229         scanOpf!(zipfile, opfPath)
 230       end
 231     rescue Zip::Error => exc
 232       puts 'ERROR processing file "' + fileName + '":'
 233       puts exc.message
 234       puts exc.backtrace
 235     end
 236   end
 237
 238   protected
 239   def scan_pdf!(file_name)
 240     #puts 'Scanning "' + file_name.to_s + '"...'
 241
 242     pdf_path = File.expand_path(file_name).to_s
 243     if ! pdf_path.end_with?('.pdf')
 244       puts 'Unexpected internal error:  path "' + file_name.to_s + '" does not end with ".pdf".'
 245       return
 246     end
 247
 248     jpeg_path = pdf_path[0..-5] + '.jpeg'
 249     if File.file?(jpeg_path)
 250       File.open(jpeg_path, 'r') do |is|
 251         @cover = Cover.new(is, jpeg_path, 'image/jpeg')
 252       end
 253     end
 254   end
 255
 256
 257   protected
 258   def scanOpf!(zipfile, opfPath)
 259     coverId = nil
 260
 261     opfXml = zipfile.read(opfPath)
 262     opfDoc = Nokogiri::XML(opfXml)
 263
 264     #-------
 265     # Author
 266
 267     grouping = @author.grouping
 268     reading_order = @author.reading_order
 269     sort_order = @author.sort_order
 270
 271     creators = opfDoc.css('dc|creator', 'dc' => @@DC_NS_URL)
 272     if (creators.length > 0)
 273       creator = creators[0]
 274       if nil != creator
 275         role = creator['opf:role']
 276         if 'aut' == role
 277           reading_order = creator.content
 278
 279           file_as = creator['opf:file-as']
 280           if nil != file_as
 281             sort_order = file_as
 282           end
 283         end
 284
 285         @author = Author.new(grouping, reading_order, sort_order)
 286       end
 287     end
 288
 289     #---------------------------------------
 290     # Title
 291
 292     titles = opfDoc.css('dc|title', 'dc' => @@DC_NS_URL)
 293     if titles.length > 0
 294       title = titles[0]
 295       if nil != title
 296         @title = title.content
 297       end
 298     end
 299
 300     #---------------------------------------
 301     # Description
 302
 303     descrNodes = opfDoc.css('dc|description', 'dc' => @@DC_NS_URL)
 304     if (descrNodes.length > 0)
 305       descrNode = descrNodes[0]
 306       if nil != descrNode
 307         @description = descrNode.content
 308       end
 309     end
 310
 311     #---------------------------------------
 312     # Other metadata:  series, volume, cover
 313
 314     metas = opfDoc.css('package metadata meta')
 315     for m in metas
 316       name = m['name']
 317       content = m['content']
 318
 319       if 'calibre:series' == name
 320         # TODO:  Dynamically create a new series?
 321         # @series_id = content
 322       elsif 'calibre:series-index' == name
 323         @volume = content
 324       elsif 'cover' == name
 325         coverId = content
 326         #puts 'File ' + @path + ' coverId ' + coverId
 327       end
 328     end
 329
 330     #---------------
 331     # Load the cover
 332
 333     @cover = load_cover(zipfile, opfPath, opfDoc, coverId)
 334   end
 335
 336   protected
 337   def load_cover(zipfile, opfPath, opfDoc, coverId)
 338     coverFile = nil
 339     if nil == coverId
 340       coverId = "cover-image"
 341     end
 342
 343     items = opfDoc.css('package manifest item')
 344     for i in items
 345       href = i['href']
 346       id = i['id']
 347       mimeType = i['media-type']
 348
 349       if coverId == id
 350         entry = zipfile.find_entry(href)
 351
 352         if nil == entry
 353           # Although the epub standard requires the path to be relative
 354           # to the base of the epub (zip), some books encountered in the
 355           # wild have been found to use a bath relative to the location
 356           # of the opf file.
 357           parts = opfPath.split('/')
 358           opfBasePath = opfPath.split('/')[0..-2].join('/')
 359           coverPath = opfBasePath + '/' + href
 360           entry = zipfile.find_entry(coverPath)
 361         end
 362
 363         unless entry
 364           # Another case found in the wild:  cover image is at the root, but path is '../cover.jpeg'
 365           if href.start_with? '../'
 366             coverPath = href[3..-1]
 367             entry = zipfile.find_entry(coverPath)
 368           end
 369         end
 370
 371         if nil == entry
 372           puts 'WARNING!  Cover image "' + href + '" not found in file "' + @path + '".'
 373           return nil
 374         else
 375           entry.get_input_stream() do |is|
 376             return Cover.new(is, href, mimeType)
 377           end
 378         end
 379       end
 380     end
 381     return nil
 382   end
 383 end
 384