Add `arrived` attribute (file creation timestamp) to books table.

[quanlib.git] / book.rb
diff --git a/book.rb b/book.rb

index 72bb83ad1170f77b29e58279eac85cb32c505146..2b93f4b574c9c45738a3a95a2c0b4f1697083d1c 100644 (file)
--- a/book.rb
+++ b/book.rb
@@ -1,43 +1,115 @@
  
  require 'nokogiri'
+require 'rubygems'
  require 'zip'
  
-require './author'
-require './cover'
+require_relative 'author'
+require_relative 'classification'
+require_relative 'cover'
+require_relative 'store'
  
  class Book
-  def initialize(fileName)
-    @author = nil
-    @cover = nil
-    @path = fileName
-    @series = nil
-    @title = nil
-    @volume = nil
+  @@DC_NS_URL = 'http://purl.org/dc/elements/1.1/'
+  @@SERIES_AND_VOLUME_REGEX = /^([A-Z]+)([0-9]+(\.[0-9]+)?)$/
+
+  attr_accessor :arrived
+  attr_accessor :author
+  attr_accessor :classification_id
+  attr_accessor :cover
+  attr_accessor :description
+  attr_accessor :language
+  attr_accessor :path
+  attr_accessor :series_id
+  attr_accessor :title
+  attr_accessor :volume
+
+  def initialize(store)
+    @store = store
+  end
  
-    parseFileName!(fileName)
+  def load_from_file!(fileName)
+    @path = fileName
+    parse_file_name!(fileName)
    end
  
-  def self.canHandle?(fileName)
+  def self.can_handle?(fileName)
      if nil == fileName
        return false
      end
  
+    #puts "Filename:  " + fileName.to_s
      lowerName = fileName.downcase()
  
      if lowerName.end_with?(".epub")
        return true
      end
  
+    if lowerName.end_with?(".pdf")
+      return true
+    end
+
      return false
    end
  
+  def self.grouping_for_title(title)
+    result = title
+
+    '\'",!#'.split('').each do |c|
+      result = result.gsub(c, '-')
+    end
+    result = result.gsub(/: */, '--')
+    result = result.gsub(' ', '_')
+
+    result
+  end
+
+  def heading
+    result = []
+
+    if nil != @title
+      result.push('<b>' + @title + '</b>')
+    else
+      result.push('<i>(Unknown title)</i>')
+    end
+    if nil != @author
+      result.push('<i>by ' + @author.reading_order + '</i>')
+    end
+
+    seriesInfo = []
+    series = @store.load_series(@series_id)
+    if nil != series and nil != series.descr
+      seriesInfo.push(series.descr.to_s)
+    end
+    if nil != @volume
+      seriesInfo.push(@volume.to_s)
+    end
+    if seriesInfo.length > 0
+      result.push(seriesInfo.join(' '))
+    end
+
+    classification = nil
+    if nil != @classification_id
+      classification = @store.load_classification(@classification_id)
+    end
+    if nil != classification
+      if nil != classification.ddc
+        result.push('Dewey: ' + classification.ddc.to_s)
+      end
+      if nil != classification.lcc
+        result.push('LCC: ' + classification.lcc.to_s)
+      end
+    end
+
+    return result.join('<br/>')
+  end
+
    def inspect
      data = []
      if nil != @author
-      data.push('author="' + @author.to_s + '"')
+      data.push('author="' + @author.inspect + '"')
      end
-    if nil != @series
-      data.push('series="' + @series + '"')
+    if nil != @series_id
+      data.push('series_id="' + @series_id.to_s() + '"')
      end
      if nil != @volume
        data.push('volume="' + @volume + '"')
@@ -58,26 +130,34 @@ class Book
      return inspect()
    end
  
+  def title_grouping
+    if nil == @path
+      return nil
+    end
+
+    return File.basename(@path, '.*')
+  end
+
    protected
    def isUpper?(c)
      return /[[:upper:]]/.match(c)
    end
  
    protected
-  def massageAuthor(input)
+  def massage_author(input)
      if nil == input
        return nil
      end
  
-    result = ""
+    reading_order = ""
      input.each_char do |c|
-      if isUpper?(c) and (result.length > 0)
-        result += " "
-      end
-      result += c
+      if isUpper?(c) and (reading_order.length > 0)
+        reading_order += " "
+     end
+      reading_order += c
      end
-    
-    return result
+
+    return reading_order
    end
  
    # Returns (series, volumeNo, titleText)
@@ -93,7 +173,7 @@ class Book
      vol = nil
  
      first = arr[0]
-    matchData = (arr[0]).match(/^([A-Z]+)([0-9]+)$/)
+    matchData = (arr[0]).match(@@SERIES_AND_VOLUME_REGEX)
      if nil != matchData
        capt = matchData.captures
        series = capt[0]
@@ -108,33 +188,98 @@ class Book
  
      title = arr.join(' ')
  
+    bare_title_grouping = title_grouping
+      .split('_')
+      .reject { |part| part.match(@@SERIES_AND_VOLUME_REGEX) }
+      .join('_')
+
+    unless bare_title_grouping == Book.grouping_for_title(title)
+      puts "WARNING:  title_grouping mismatch:  #{bare_title_grouping.inspect} vs. #{Book.grouping_for_title(title).inspect}"
+    end
+
      return series, vol, title
    end
  
    protected
-  def parseFileName!(fileName)
-    parts = fileName.split('/')
-    (@series, @volume, @title) = processTitle(parts[-1])
+  def parse_file_name!(file_name)
+    category = nil   # e.g., non-fiction, fan-fiction
+    grouping = ''
+
+    parts = file_name.split('/')
+    (series_code, @volume, @title) = processTitle(parts[-1])
      if parts.length > 1
-      @author = massageAuthor(parts[-2])
+      grouping = parts[-2]
+      reading_order = massage_author(grouping)
+      sort_order = nil
+      @author = Author.new(grouping, reading_order, sort_order)
+      @series_id = @store.get_series(grouping, series_code)
+    end
+    if parts.length > 2
+      category = parts[-3]
+    end
+
+    lc_file_name = file_name.downcase
+    if lc_file_name.end_with?(".epub")
+      scanEpub!(file_name)
+    elsif lc_file_name.end_with?(".pdf")
+      scan_pdf!(file_name)
      end
  
-    if fileName.downcase.end_with?(".epub")
-      scanEpub!(fileName)
+    @arrived = File.ctime(file_name)
+
+    @classification_id = @store.find_classification(@author.grouping, File.basename(file_name, '.*'))
+
+    # TODO:  Fix horrible hard-coded strings and paths
+    if ('01_nonfic' == category) && (nil == classification_id)
+      open(Store.unclassified_csv, 'a') do |fd|
+        fd.puts('"' + grouping.to_s + '","' + path + '"')
+      end
      end
    end
  
-  protected 
+  protected
    def scanEpub!(fileName)
-    Zip::File.open(fileName) do |zipfile|
-      contXml = zipfile.read('META-INF/container.xml')
-      contDoc = Nokogiri::XML(contXml)
-      opfPath = contDoc.css("container rootfiles rootfile")[0]['full-path']
+    #puts 'Scanning "' + fileName.to_s + '"...'
+    begin
+      Zip.warn_invalid_date = false
+      Zip::File.open(fileName) do |zipfile|
+        entry = zipfile.find_entry('META-INF/container.xml')
+        if nil == entry
+          puts 'No META-INF/container.xml, skipping book ' + fileName
+          return
+        end
+        contXml = zipfile.read('META-INF/container.xml')
+        contDoc = Nokogiri::XML(contXml)
+        opfPath = contDoc.css("container rootfiles rootfile")[0]['full-path']
+
+        scanOpf!(zipfile, opfPath)
+      end
+    rescue Zip::Error => exc
+      puts 'ERROR processing file "' + fileName + '":'
+      puts exc.message
+      puts exc.backtrace
+    end
+  end
+
+  protected
+  def scan_pdf!(file_name)
+    #puts 'Scanning "' + file_name.to_s + '"...'
  
-      scanOpf!(zipfile, opfPath)
+    pdf_path = File.expand_path(file_name).to_s
+    if ! pdf_path.end_with?('.pdf')
+      puts 'Unexpected internal error:  path "' + file_name.to_s + '" does not end with ".pdf".'
+      return
+    end
+
+    jpeg_path = pdf_path[0..-5] + '.jpeg'
+    if File.file?(jpeg_path)
+      File.open(jpeg_path, 'r') do |is|
+        @cover = Cover.new(is, jpeg_path, 'image/jpeg')
+      end
      end
    end
  
+
    protected
    def scanOpf!(zipfile, opfPath)
      coverId = nil
@@ -145,22 +290,58 @@ class Book
      #-------
      # Author
  
-    creator = opfDoc.css('dc|creator', 'dc' => 'http://purl.org/dc/elements/1.1/')
-    if nil != creator
-      roleNode = creator.attr('role')
-      if nil != roleNode
-        role = roleNode.value
+    grouping = @author.grouping
+    reading_order = @author.reading_order
+    sort_order = @author.sort_order
+
+    creators = opfDoc.css('dc|creator', 'dc' => @@DC_NS_URL)
+    if (creators.length > 0)
+      creator = creators[0]
+      if nil != creator
+        role = creator['opf:role']
          if 'aut' == role
-          name = creator.children[0].content
-          parts = name.split(' ')
-          if parts.length > 1
-            surname = parts[-1]
-            givenNames = parts[0..-2].join(' ')
-            @author = Author.new(surname, givenNames)
-          else
-            @author = Author.new(name, '')
+          reading_order = creator.content
+
+          file_as = creator['opf:file-as']
+          if nil != file_as
+            sort_order = file_as
            end
          end
+
+        @author = Author.new(grouping, reading_order, sort_order)
+      end
+    end
+
+    #---------------------------------------
+    # Title
+
+    titles = opfDoc.css('dc|title', 'dc' => @@DC_NS_URL)
+    if titles.length > 0
+      title = titles[0]
+      if nil != title
+        @title = title.content
+      end
+    end
+
+    #---------------------------------------
+    # Description
+
+    descrNodes = opfDoc.css('dc|description', 'dc' => @@DC_NS_URL)
+    if (descrNodes.length > 0)
+      descrNode = descrNodes[0]
+      if nil != descrNode
+        @description = descrNode.content
+      end
+    end
+
+    #---------------------------------------
+    # Language
+
+    langNodes = opfDoc.css('dc|language', 'dc' => @@DC_NS_URL)
+    if (langNodes.length > 0)
+      langNode = langNodes[0]
+      if langNode
+        @language = langNode.content
        end
      end
  
@@ -173,33 +354,68 @@ class Book
        content = m['content']
  
        if 'calibre:series' == name
-        @series = content
+        # TODO:  Dynamically create a new series?
+        # @series_id = content
        elsif 'calibre:series-index' == name
          @volume = content
        elsif 'cover' == name
          coverId = content
+        #puts 'File ' + @path + ' coverId ' + coverId
        end
      end
  
      #---------------
      # Load the cover
  
+    @cover = load_cover(zipfile, opfPath, opfDoc, coverId)
+  end
+
+  protected
+  def load_cover(zipfile, opfPath, opfDoc, coverId)
      coverFile = nil
-    if nil != coverId
-      items = opfDoc.css('package manifest item')
-      for i in items
-        href = i['href']
-        id = i['id']
-        mimeType = i['media-type']
-
-        if coverId == id
-          entry = zipfile.find_entry(href)
+    if nil == coverId
+      coverId = "cover-image"
+    end
+
+    items = opfDoc.css('package manifest item')
+    for i in items
+      href = i['href']
+      id = i['id']
+      mimeType = i['media-type']
+
+      if coverId == id
+        entry = zipfile.find_entry(href)
+
+        if nil == entry
+          # Although the epub standard requires the path to be relative
+          # to the base of the epub (zip), some books encountered in the
+          # wild have been found to use a path relative to the location
+          # of the opf file.
+          parts = opfPath.split('/')
+          opfBasePath = opfPath.split('/')[0..-2].join('/')
+          coverPath = opfBasePath + '/' + href
+          entry = zipfile.find_entry(coverPath)
+        end
+
+        unless entry
+          # Another case found in the wild:  cover image is at the root, but path is '../cover.jpeg'
+          if href.start_with? '../'
+            coverPath = href[3..-1]
+            entry = zipfile.find_entry(coverPath)
+          end
+        end
+
+        if nil == entry
+          puts 'WARNING!  Cover image "' + href + '" not found in file "' + @path + '".'
+          return nil
+        else
            entry.get_input_stream() do |is|
-            @cover = Cover.new(is, href, mimeType)
+            return Cover.new(is, href, mimeType)
            end
          end
        end
      end
+    return nil
    end
  end