6 require_relative 'author'
7 require_relative 'classification'
8 require_relative 'cover'
9 require_relative 'store'
# Dublin Core element namespace URI. The OPF lookups in this file bind it to
# the 'dc' prefix when selecting dc|creator, dc|title, dc|description and
# dc|language nodes.
12 @@DC_NS_URL = 'http://purl.org/dc/elements/1.1/'
# Matches a token made of ASCII capital letters followed by a number with an
# optional decimal part (e.g. "AB12" or "AB12.5"). Capture 1 is the letters,
# capture 2 the whole number, capture 3 the fractional part when present.
# NOTE(review): ^ and $ anchor at line boundaries in Ruby; \A and \z anchor
# the whole string -- confirm whether inputs can ever contain newlines.
13 @@SERIES_AND_VOLUME_REGEX = /^([A-Z]+)([0-9]+(\.[0-9]+)?)$/
# Assigned in parse_file_name! from File.ctime of the book file.
15 attr_accessor :arrived
# Assigned in parse_file_name! from @store.find_classification.
17 attr_accessor :classification_id
# Assigned in scanOpf! from the first dc:description node, when one exists.
19 attr_accessor :description
# Assigned in scanOpf! from the first dc:language node, when one exists.
20 attr_accessor :language
# Assigned in parse_file_name! from @store.get_series.
22 attr_accessor :series_id
# Populates this book from the file named by fileName. The visible part of
# the body hands the name to parse_file_name!.
# NOTE(review): the neighbouring original lines (including the closing `end`)
# are not part of this excerpt, so any other steps are not shown here.
30 def load_from_file!(fileName)
32 parse_file_name!(fileName)
# Class-level predicate on a file name. The name is lower-cased before the
# suffix tests, so ".epub" and ".pdf" are recognised regardless of case.
# NOTE(review): the branch bodies and the fall-through value are not in this
# excerpt -- presumably true for these two suffixes and false otherwise;
# confirm against the full file.
35 def self.can_handle?(fileName)
40 #puts "Filename: " + fileName.to_s
41 lowerName = fileName.downcase()
43 if lowerName.end_with?(".epub")
47 if lowerName.end_with?(".pdf")
# Class-level helper that derives a "grouping" string from a title by
# replacing characters in a fixed order.
# NOTE(review): the initialisation of `result` and the return statement are
# not in this excerpt -- presumably result starts as the title; confirm.
54 def self.grouping_for_title(title)
# Each of the five characters ' " , ! # is replaced by a hyphen.
57 '\'",!#'.split('').each do |c|
58 result = result.gsub(c, '-')
# A colon together with any spaces that follow it becomes a double hyphen.
# This runs before the space rule below, so ": " never yields an underscore.
60 result = result.gsub(/: */, '--')
# Every remaining space becomes an underscore.
61 result = result.gsub(' ', '_')
# Body of a method whose `def` line is not part of this excerpt. It collects
# HTML fragments in `result` and returns them joined with '<br/>'.
# NOTE(review): on the visible lines @title, the author's reading order, the
# series description and the classification codes are concatenated into HTML
# without escaping -- confirm whether these values can contain markup.
#
# Title in bold, or a placeholder. The condition choosing between the two is
# not shown here (presumably a nil check on @title).
70 result.push('<b>' + @title + '</b>')
72 result.push('<i>(Unknown title)</i>')
# Author line, using the Author object's reading_order.
75 result.push('<i>by ' + @author.reading_order + '</i>')
# Series line: the series description (looked up in the store by @series_id)
# and the volume, joined with a space; pushed only when at least one part
# was collected.
79 series = @store.load_series(@series_id)
80 if nil != series and nil != series.descr
81 seriesInfo.push(series.descr.to_s)
84 seriesInfo.push(@volume.to_s)
86 if seriesInfo.length > 0
87 result.push(seriesInfo.join(' '))
# Classification lines: one for `ddc` (labelled "Dewey") and one for `lcc`
# (labelled "LCC"), each only when the loaded record has that value.
91 if nil != @classification_id
92 classification = @store.load_classification(@classification_id)
94 if nil != classification
95 if nil != classification.ddc
96 result.push('Dewey: ' + classification.ddc.to_s)
98 if nil != classification.lcc
99 result.push('LCC: ' + classification.lcc.to_s)
103 return result.join('<br/>')
# Body of a method whose `def` line is not part of this excerpt (presumably
# `inspect` or `to_s` -- confirm). It builds a one-line debug representation
# of the form (Book:key="value",...).
# NOTE(review): @volume, @title and @path are appended with String#+, which
# requires a String operand. The omitted lines between these pushes
# presumably guard against nil -- confirm.
109 data.push('author="' + @author.inspect + '"')
112 data.push('series_id="' + @series_id.to_s() + '"')
115 data.push('volume="' + @volume + '"')
118 data.push('title="' + @title + '"')
# The cover contributes its own inspect output, without a key="..." wrapper.
121 data.push(@cover.inspect())
124 data.push('path="' + @path + '"')
126 return '(Book:' + data.join(',') + ')'
# Sole visible line of a method whose `def` line is not part of this excerpt.
# Returns the last component of @path with its extension removed (the '.*'
# suffix argument makes File.basename strip any extension).
138 return File.basename(@path, '.*')
# Sole visible line of a method whose `def` line is not part of this excerpt
# (presumably isUpper?(c), which massage_author calls -- confirm).
# Returns a MatchData (truthy) when c contains an uppercase character as
# defined by the POSIX [[:upper:]] class, and nil otherwise.
143 return /[[:upper:]]/.match(c)
# Converts an author grouping string into a reading-order string;
# parse_file_name! assigns the result to its reading_order local.
# The visible lines walk the input one character at a time and single out
# uppercase characters that are not the first character emitted.
# NOTE(review): what is done at such a character is not in this excerpt --
# presumably a separator is inserted before it to split run-together names;
# confirm against the full file.
147 def massage_author(input)
153 input.each_char do |c|
154 if isUpper?(c) and (reading_order.length > 0)
# Splits the final path component of a book file into its series code,
# volume and title.
163 # Returns (series, volumeNo, titleText)
# NOTE(review): several original lines are missing from this excerpt,
# including the assignments to `series` and `vol` and any guard on a failed
# regex match.
165 def processTitle(input)
# Underscores separate the tokens of the file name.
170 arr = input.split('_')
# The first token is tested against the series-and-volume pattern.
# `captures` is only valid on a successful match, so the omitted lines
# presumably check matchData for nil first -- confirm.
176 matchData = (arr[0]).match(@@SERIES_AND_VOLUME_REGEX)
178 capt = matchData.captures
# Drop the file extension from the last token: everything from the final '.'
# onward is cut off.
184 pos = arr[-1].rindex('.')
186 arr[-1] = arr[-1].slice(0, pos)
# The title text is the tokens joined with single spaces.
189 title = arr.join(' ')
# Consistency check: title_grouping with its series/volume tokens removed
# should equal the grouping derived from the title; otherwise a warning is
# printed. The check does not change the returned values on the visible lines.
191 bare_title_grouping = title_grouping
193 .reject { |part| part.match(@@SERIES_AND_VOLUME_REGEX) }
196 unless bare_title_grouping == Book.grouping_for_title(title)
197 puts "WARNING: title_grouping mismatch: #{bare_title_grouping.inspect} vs. #{Book.grouping_for_title(title).inspect}"
200 return series, vol, title
# Derives this book's metadata from its file path: volume and title from the
# file name, author and series via the grouping, arrival time from the file
# system, and the classification from the store.
# NOTE(review): the assignments to `grouping`, `sort_order` and `category`,
# and the bodies of the extension branches, are not in this excerpt.
204 def parse_file_name!(file_name)
205 category = nil # e.g., non-fiction, fan-fiction
# The path is split on '/', so forward-slash separators are assumed.
208 parts = file_name.split('/')
# The last path component supplies series code, volume and title.
209 (series_code, @volume, @title) = processTitle(parts[-1])
212 reading_order = massage_author(grouping)
214 @author = Author.new(grouping, reading_order, sort_order)
215 @series_id = @store.get_series(grouping, series_code)
# Format-specific handling is chosen by a case-insensitive extension test;
# presumably these branches call scanEpub! and scan_pdf! -- confirm.
221 lc_file_name = file_name.downcase
222 if lc_file_name.end_with?(".epub")
224 elsif lc_file_name.end_with?(".pdf")
# File.ctime is the file's change time as reported by the OS (on Windows it
# is the creation time), not the time its content was last modified.
228 @arrived = File.ctime(file_name)
# The classification is looked up by author grouping plus the file name
# without directory or extension.
230 @classification_id = @store.find_classification(@author.grouping, File.basename(file_name, '.*'))
232 # TODO: Fix horrible hard-coded strings and paths
# Books in the '01_nonfic' category with no classification are appended as a
# two-column quoted row to the file named by Store.unclassified_csv.
# NOTE(review): the values are wrapped in double quotes but embedded quotes
# are not escaped. Bare `open` is Kernel#open; File.open would avoid its
# special handling of names beginning with '|'.
233 if ('01_nonfic' == category) && (nil == classification_id)
234 open(Store.unclassified_csv, 'a') do |fd|
235 fd.puts('"' + grouping.to_s + '","' + path + '"')
# Opens an EPUB (a zip archive) and hands the package document it names to
# scanOpf!. Zip::Error is rescued and reported on stdout instead of being
# propagated.
# NOTE(review): some original lines (guards, `end`s, the rest of the rescue
# body) are not in this excerpt.
241 def scanEpub!(fileName)
242 #puts 'Scanning "' + fileName.to_s + '"...'
# Turns off the zip library's invalid-date warnings. This is a setting on the
# Zip module itself, so it is not limited to this call.
244 Zip.warn_invalid_date = false
245 Zip::File.open(fileName) do |zipfile|
# META-INF/container.xml is the entry that names the package (OPF) file; a
# book without it is skipped with a message.
246 entry = zipfile.find_entry('META-INF/container.xml')
248 puts 'No META-INF/container.xml, skipping book ' + fileName
251 contXml = zipfile.read('META-INF/container.xml')
252 contDoc = Nokogiri::XML(contXml)
# Only the first <rootfile> is used.
# NOTE(review): if no rootfile element matches, [0] is nil and the ['full-path']
# lookup raises NoMethodError, which the Zip::Error rescue does not cover.
253 opfPath = contDoc.css("container rootfiles rootfile")[0]['full-path']
255 scanOpf!(zipfile, opfPath)
257 rescue Zip::Error => exc
258 puts 'ERROR processing file "' + fileName + '":'
# Looks for a cover image for a PDF: a file with the same absolute path but
# a '.jpeg' extension. When it exists, @cover is set from it.
# NOTE(review): some original lines (the early exit after the error message,
# the `end`s) are not in this excerpt.
265 def scan_pdf!(file_name)
266 #puts 'Scanning "' + file_name.to_s + '"...'
268 pdf_path = File.expand_path(file_name).to_s
# NOTE(review): this suffix test is case-sensitive, whereas the extension
# checks in parse_file_name! and can_handle? lower-case the name first. A
# file ending in ".PDF" would therefore land in this error branch -- confirm
# whether that is intended.
269 if ! pdf_path.end_with?('.pdf')
270 puts 'Unexpected internal error: path "' + file_name.to_s + '" does not end with ".pdf".'
# [0..-5] drops the last four characters, i.e. the ".pdf" suffix.
274 jpeg_path = pdf_path[0..-5] + '.jpeg'
275 if File.file?(jpeg_path)
# The stream is closed when this block exits.
# NOTE(review): presumably Cover.new reads the image data during
# construction -- confirm in Cover.
276 File.open(jpeg_path, 'r') do |is|
277 @cover = Cover.new(is, jpeg_path, 'image/jpeg')
# Reads the package (OPF) document at opfPath inside the open zip and
# refines the book's metadata from it: author, title, description, language
# and cover.
# NOTE(review): many original lines are missing from this excerpt, among them
# the conditions around role/file-as, the loop header over `metas`, and the
# assignments to `name`, `title` and `coverId`.
284 def scanOpf!(zipfile, opfPath)
287 opfXml = zipfile.read(opfPath)
288 opfDoc = Nokogiri::XML(opfXml)
# Author: start from the values already on @author, then let the first
# dc:creator element override them. @author is rebuilt with the same
# grouping.
293 grouping = @author.grouping
294 reading_order = @author.reading_order
295 sort_order = @author.sort_order
297 creators = opfDoc.css('dc|creator', 'dc' => @@DC_NS_URL)
298 if (creators.length > 0)
299 creator = creators[0]
# The opf:role and opf:file-as attributes are read from the creator element.
# How they are used is not shown here (presumably role filters for authors
# and file-as feeds sort_order -- confirm).
301 role = creator['opf:role']
303 reading_order = creator.content
305 file_as = creator['opf:file-as']
311 @author = Author.new(grouping, reading_order, sort_order)
315 #---------------------------------------
# Title: taken from a dc:title node's text content.
318 titles = opfDoc.css('dc|title', 'dc' => @@DC_NS_URL)
322 @title = title.content
326 #---------------------------------------
# Description: text of the first dc:description node, if any.
329 descrNodes = opfDoc.css('dc|description', 'dc' => @@DC_NS_URL)
330 if (descrNodes.length > 0)
331 descrNode = descrNodes[0]
333 @description = descrNode.content
337 #---------------------------------------
# Language: text of the first dc:language node, if any.
340 langNodes = opfDoc.css('dc|language', 'dc' => @@DC_NS_URL)
341 if (langNodes.length > 0)
342 langNode = langNodes[0]
344 @language = langNode.content
348 #---------------------------------------
349 # Other metadata: series, volume, cover
# <meta> elements are dispatched on `name` (presumably the element's name
# attribute), with the value taken from the content attribute.
351 metas = opfDoc.css('package metadata meta')
354 content = m['content']
# The series name from calibre metadata is deliberately ignored for now.
356 if 'calibre:series' == name
357 # TODO: Dynamically create a new series?
358 # @series_id = content
359 elsif 'calibre:series-index' == name
361 elsif 'cover' == name
363 #puts 'File ' + @path + ' coverId ' + coverId
# The cover is resolved via the manifest using the id collected above.
370 @cover = load_cover(zipfile, opfPath, opfDoc, coverId)
# Locates the cover image entry in the zip and returns a Cover built from
# it. Up to three locations are tried for the manifest href: as given,
# relative to the OPF file's directory, and with a leading '../' stripped.
# NOTE(review): several original lines are missing from this excerpt,
# including the condition under which coverId is defaulted, the loop over
# `items`, the assignment of `href`, and the nil checks between attempts.
374 def load_cover(zipfile, opfPath, opfDoc, coverId)
# Fallback manifest id (presumably applied only when no cover id was
# supplied -- the guarding condition is not shown).
377 coverId = "cover-image"
380 items = opfDoc.css('package manifest item')
384 mimeType = i['media-type']
# Attempt 1: treat href as a path from the root of the archive.
387 entry = zipfile.find_entry(href)
390 # Although the epub standard requires the path to be relative
391 # to the base of the epub (zip), some books encountered in the
392 # wild have been found to use a path relative to the location
# NOTE(review): as far as I know the EPUB spec resolves manifest hrefs
# relative to the package (OPF) document, which would make this attempt the
# standard case rather than the exception -- confirm against the spec.
# Attempt 2: prefix href with the directory of the OPF file.
# `parts` is not used on the visible lines; the next line splits again.
# When opfPath has no directory component, opfBasePath is '' and coverPath
# starts with '/'.
394 parts = opfPath.split('/')
395 opfBasePath = opfPath.split('/')[0..-2].join('/')
396 coverPath = opfBasePath + '/' + href
397 entry = zipfile.find_entry(coverPath)
401 # Another case found in the wild: cover image is at the root, but path is '../cover.jpeg'
# Attempt 3: drop the leading '../' (three characters) and look from the root.
402 if href.start_with? '../'
403 coverPath = href[3..-1]
404 entry = zipfile.find_entry(coverPath)
409 puts 'WARNING! Cover image "' + href + '" not found in file "' + @path + '".'
# The Cover is given the entry's input stream, the original href (not the
# path that was finally resolved) and the manifest media type.
412 entry.get_input_stream() do |is|
413 return Cover.new(is, href, mimeType)