Improve metadata extraction from epubs, and clean up the display of popups in the...
[quanlib.git] / book.rb
diff --git a/book.rb b/book.rb
index 5f14aed14790523eaa012e07857d650bfb43b1e2..3cf513957c0a2cc4173266857c95952b8cc0bb14 100644 (file)
--- a/book.rb
+++ b/book.rb
@@ -1,10 +1,18 @@
 
-require './author.rb'
+require 'nokogiri'
+require 'zip'
+
+require 'author'
+require 'cover'
 
 class Book
+  @@DC_NS_URL = 'http://purl.org/dc/elements/1.1/'
+
   def initialize(fileName)
     @author = nil
     @cover = nil
+    @description = nil
+    @path = fileName
     @series = nil
     @title = nil
     @volume = nil
@@ -17,6 +25,7 @@ class Book
       return false
     end
 
+    #puts "Filename:  " + fileName.to_s
     lowerName = fileName.downcase()
 
     if lowerName.end_with?(".epub")
@@ -26,8 +35,45 @@ class Book
     return false
   end
 
+  def cover  # reader for the book's cover
+    return @cover  # nil from initialize until scanOpf! stores loadCover's result
+  end
+
+  def description  # reader for the book's description
+    @description  # nil unless scanOpf! found a dc:description element
+  end
+
+  def heading  # builds the heading text (title, author, series) with HTML-style tags
+    result = []  # one entry per heading line
+
+    if nil != @title
+      result.push('<b>' + @title + '</b>')  # NOTE(review): @title is inserted unescaped; confirm '&' and '<' are safe for the consumer
+    else
+      result.push('<i>(Unknown title)</i>')  # placeholder when no title is known
+    end
+    if nil != @author
+      result.push('<i>by ' + @author.to_s() + '</i>')  # NOTE(review): author text is likewise unescaped
+    end
+    
+    seriesInfo = []  # series name and volume share a single line
+    if nil != @series
+      seriesInfo.push(@series.to_s)
+    end
+    if nil != @volume
+      seriesInfo.push(@volume.to_s)
+    end
+    if seriesInfo.length > 0  # omit the series line when neither value is set
+      result.push(seriesInfo.join(' '))
+    end
+
+    return result.join('<br/>')  # lines are separated by <br/> tags
+  end
+
   def inspect
     data = []
+    if nil != @author
+      data.push('author="' + @author.to_s + '"')
+    end
     if nil != @series
       data.push('series="' + @series + '"')
     end
@@ -37,12 +83,19 @@ class Book
     if nil != @title
       data.push('title="' + @title + '"')
     end
-    if nil != @author
-      data.push('author="' + @author + '"')
+    if nil != @cover
+      data.push(@cover.inspect())
+    end
+    if nil != @path
+      data.push('path="' + @path + '"')
     end
     return '(Book:' + data.join(',') + ')'
   end
 
+  def path  # reader for the book's file path
+    @path  # the fileName that was passed to initialize
+  end
+
   def to_s  # string form of the book
     return inspect()  # identical to the text produced by inspect
   end
@@ -107,5 +160,146 @@ class Book
     if parts.length > 1
       @author = massageAuthor(parts[-2])
     end
+
+    if fileName.downcase.end_with?(".epub")
+      scanEpub!(fileName)
+    end
+  end
+
+  protected 
+  def scanEpub!(fileName)
+    # Opens the epub (a zip archive) at fileName, reads META-INF/container.xml
+    # to find the path of the OPF document, and passes that path to scanOpf!.
+    # Zip errors are printed to stdout instead of being raised.
+    begin
+      Zip::File.open(fileName) do |zipfile|
+        return if nil == zipfile.find_entry('META-INF/container.xml')
+        contDoc = Nokogiri::XML(zipfile.read('META-INF/container.xml'))
+        rootfile = contDoc.css("container rootfiles rootfile")[0]
+        opfPath = rootfile && rootfile['full-path']
+        # Give up quietly when no rootfile is declared or it names a missing entry.
+        return if nil == opfPath || nil == zipfile.find_entry(opfPath)
+        scanOpf!(zipfile, opfPath)
+      end
+    rescue Zip::Error => exc
+      puts 'ERROR processing file "' + fileName + '":'
+      puts exc.message
+      puts exc.backtrace
+    end
+  end
+
+  protected
+  def scanOpf!(zipfile, opfPath)  # fills author, title, description, series, volume and cover from the OPF document
+    coverId = nil  # manifest id of the cover image, if a meta element names one
+
+    opfXml = zipfile.read(opfPath)
+    opfDoc = Nokogiri::XML(opfXml)
+
+    #-------
+    # Author
+
+    creators = opfDoc.css('dc|creator', 'dc' => @@DC_NS_URL)
+    if (creators.length > 0)
+      creator = creators[0]  # only the first dc:creator element is examined
+      if nil != creator
+        role = creator['opf:role']
+        if 'aut' == role  # the name is used only when the role attribute is 'aut'
+          name = creator.content
+          parts = name.split(' ')
+          if parts.length > 1
+            surname = parts[-1]  # the last word is taken as the surname
+            givenNames = parts[0..-2].join(' ')  # everything before it as given names
+            @author = Author.new(surname, givenNames)
+          else
+            @author = Author.new(name, '')  # single-word name: no given names
+          end
+        end
+      end
+    end
+
+    #---------------------------------------
+    # Title
+
+    titles = opfDoc.css('dc|title', 'dc' => @@DC_NS_URL)
+    if titles.length > 0
+      title = titles[0]  # first dc:title element
+      if nil != title
+        @title = title.content  # replaces any title already stored in @title
+      end
+    end
+
+    #---------------------------------------
+    # Description
+    
+    descrNodes = opfDoc.css('dc|description', 'dc' => @@DC_NS_URL)
+    if (descrNodes.length > 0)
+      descrNode = descrNodes[0]  # first dc:description element
+      if nil != descrNode
+        @description = descrNode.content
+      end
+    end
+
+    #---------------------------------------
+    # Other metadata:  series, volume, cover
+
+    metas = opfDoc.css('package metadata meta')
+    for m in metas
+      name = m['name']
+      content = m['content']
+
+      if 'calibre:series' == name
+        @series = content
+      elsif 'calibre:series-index' == name
+        @volume = content  # stored as the attribute's string value
+      elsif 'cover' == name
+        coverId = content  # looked up in the manifest by loadCover
+      end
+    end
+
+    #---------------
+    # Load the cover
+
+    @cover = loadCover(zipfile, opfPath, opfDoc, coverId)  # nil when no cover could be loaded
+  end
+
+  protected
+  def loadCover(zipfile, opfPath, opfDoc, coverId)
+    # Looks up the manifest item whose id equals coverId (defaulting to
+    # "cover-image") and returns it wrapped in a Cover, or nil when there is no
+    # such item or its file is missing.  zipfile is the open epub, opfPath the
+    # path of the OPF document inside it, and opfDoc that document parsed.
+    coverId = "cover-image" if nil == coverId
+
+    items = opfDoc.css('package manifest item')
+    items.each do |item|
+      href = item['href']
+      id = item['id']
+      mimeType = item['media-type']
+
+      # An item without an href names no file and cannot be concatenated below.
+      if coverId == id && nil != href
+        entry = zipfile.find_entry(href)
+
+        if nil == entry
+          # Manifest hrefs are normally relative to the location of the opf
+          # file, so retry there when nothing was found relative to the base
+          # of the epub (zip).
+          opfBasePath = opfPath.split('/')[0..-2].join('/')
+          coverPath = opfBasePath + '/' + href
+          entry = zipfile.find_entry(coverPath)
+        end
+
+        if nil == entry
+          puts 'WARNING!  Cover image "' + href + '" not found in file "' + @path + '".'
+          return nil
+        else
+          entry.get_input_stream() do |is|
+            return Cover.new(is, href, mimeType)
+          end
+        end
+      end
+    end
+    return nil
+  end
 end
+