From: Chris Jaekl Date: Fri, 24 Feb 2017 14:23:31 +0000 (+0900) Subject: Parse .epub files to extract metadata and cover image. X-Git-Url: https://jaekl.net/gitweb/?a=commitdiff_plain;h=bd862df9f2c494a890357b673a4ef26298515432;p=quanlib.git Parse .epub files to extract metadata and cover image. --- diff --git a/author.rb b/author.rb index 6bfe610..a0a95e8 100644 --- a/author.rb +++ b/author.rb @@ -4,4 +4,12 @@ class Author @surname = surname @givenNames = givenNames end + + def inspect + @givenNames + ' ' + @surname + end + + def to_s + inspect + end end diff --git a/book.rb b/book.rb index 814d890..72bb83a 100644 --- a/book.rb +++ b/book.rb @@ -1,5 +1,9 @@ -require './author.rb' +require 'nokogiri' +require 'zip' + +require './author' +require './cover' class Book def initialize(fileName) @@ -30,7 +34,7 @@ class Book def inspect data = [] if nil != @author - data.push('author="' + @author + '"') + data.push('author="' + @author.to_s + '"') end if nil != @series data.push('series="' + @series + '"') @@ -41,6 +45,9 @@ class Book if nil != @title data.push('title="' + @title + '"') end + if nil != @cover + data.push(@cover.inspect()) + end if nil != @path data.push('path="' + @path + '"') end @@ -111,5 +118,88 @@ class Book if parts.length > 1 @author = massageAuthor(parts[-2]) end + + if fileName.downcase.end_with?(".epub") + scanEpub!(fileName) + end + end + + protected + def scanEpub!(fileName) + Zip::File.open(fileName) do |zipfile| + contXml = zipfile.read('META-INF/container.xml') + contDoc = Nokogiri::XML(contXml) + opfPath = contDoc.css("container rootfiles rootfile")[0]['full-path'] + + scanOpf!(zipfile, opfPath) + end + end + + protected + def scanOpf!(zipfile, opfPath) + coverId = nil + + opfXml = zipfile.read(opfPath) + opfDoc = Nokogiri::XML(opfXml) + + #------- + # Author + + creator = opfDoc.css('dc|creator', 'dc' => 'http://purl.org/dc/elements/1.1/') + if nil != creator + roleNode = creator.attr('role') + if nil != roleNode + role = roleNode.value + if 'aut' == role + name = creator.children[0].content + parts = name.split(' ') + if parts.length > 1 + surname = parts[-1] + givenNames = parts[0..-2].join(' ') + @author = Author.new(surname, givenNames) + else + @author = Author.new(name, '') + end + end + end + end + + #--------------------------------------- + # Other metadata: series, volume, cover + + metas = opfDoc.css('package metadata meta') + for m in metas + name = m['name'] + content = m['content'] + + if 'calibre:series' == name + @series = content + elsif 'calibre:series-index' == name + @volume = content + elsif 'cover' == name + coverId = content + end + end + + #--------------- + # Load the cover + + coverFile = nil + if nil != coverId + items = opfDoc.css('package manifest item') + for i in items + href = i['href'] + id = i['id'] + mimeType = i['media-type'] + + if coverId == id + entry = zipfile.find_entry(href) + entry.get_input_stream() do |is| + @cover = Cover.new(is, href, mimeType) + end + end + end + end end end + diff --git a/main.rb b/main.rb index 98f5a35..76f543e 100644 --- a/main.rb +++ b/main.rb @@ -1,4 +1,4 @@ -require './walkdir.rb' +require './walkdir' books = [] diff --git a/walkdir.rb b/walkdir.rb index b54d0a2..c776dbf 100644 --- a/walkdir.rb +++ b/walkdir.rb @@ -18,7 +18,7 @@ # and Mrs. Pollifax volume 6, On the China Station, is # .../DorothyGilman/P06_On_the_China_Station.epub. -require './book.rb' +require './book' class WalkDir def initialize(root)