# frozen_string_literal: true

# extract.rb: extract library books to .txt (for indexing).
#
# Provenance (from the original commit):
#   From: Chris Jaekl
#   Date: Mon, 8 Jul 2019 23:48:07 +0000 (-0400)
#   Subject: Add extract.rb: extract library books to .txt (for indexing).
#   X-Git-Url: https://jaekl.net/gitweb/?a=commitdiff_plain;h=3b386f9b3645e051764a58db3705c1f71e542ee8;p=quanlib.git
#
# Usage: ruby extract.rb DEST_DIR SOURCE_DIR [SOURCE_DIR ...]
#
# Walks every SOURCE_DIR, and for each .epub file found writes a .txt
# rendering (produced by the external `ebook-convert` command) at the same
# relative location underneath DEST_DIR.

require 'find'
require 'pathname'

EPUB_EXTENSION = '.epub'
TEXT_EXTENSION = '.txt'

# File-name endings that are never extracted.
# NOTE(review): presumably these mark alternate copies of a book already
# present under its plain name -- confirm with the library's naming rules.
SKIPPED_SUFFIXES = %w[_bis.epub _ter.epub _quater.epub].freeze

# Echoes a command, runs it, and reports failure on stdout.
#
# Accepts either a single shell command string (run through the shell, as
# before) or a command followed by its arguments. The multi-argument form
# bypasses the shell, so paths containing spaces or shell metacharacters are
# passed through verbatim.
#
# NOTE: this top-level definition shadows Kernel#exec for the whole script;
# the name is kept for backward compatibility.
#
# @param cmdline [Array<#to_s>] shell string, or command plus arguments
# @return [true, false, nil] the value returned by Kernel#system
def exec(*cmdline)
  words = cmdline.map(&:to_s)
  shown = words.join(' ')
  puts "$ #{shown}"
  result = system(*words)
  puts "FAILED: #{shown}" unless result
  result
end

# Maps an epub below +source_path+ to its .txt counterpart below +dest_path+.
#
# @param source_file [String] path of the epub; expected to start with +source_path+
# @param source_path [String] root directory that was scanned
# @param dest_path [String] root directory for the extracted text
# @return [String] destination path with the .epub suffix replaced by .txt
def text_path_for(source_file, source_path, dest_path)
  relative_path = source_file[source_path.length..-1].to_s
  # Anchored, escaped pattern: only a literal trailing ".epub" is replaced.
  File.join(dest_path, relative_path.sub(/\.epub\z/, '') + TEXT_EXTENSION)
end

# Whether +path+ names an epub that should be extracted.
#
# Uses literal suffix comparison, so a name such as "notanepub" (which the
# pattern /.epub\Z/ would accept) is rejected.
#
# @param path [String]
# @return [Boolean]
def extractable_epub?(path)
  path.end_with?(EPUB_EXTENSION) && !path.end_with?(*SKIPPED_SUFFIXES)
end

# Whether +dest_file+ exists and is strictly newer than +source_file+.
#
# @param source_file [String]
# @param dest_file [String]
# @return [Boolean]
def up_to_date?(source_file, dest_file)
  File.exist?(dest_file) && File.mtime(dest_file) > File.mtime(source_file)
end

# Extracts one epub to text, creating the destination directory if needed.
#
# @param source_file [String] path of the epub to convert
# @param source_path [String] root directory that +source_file+ was found under
# @param dest_path [String] root directory to write the .txt file under
# @return [true, false, nil] false when the destination directory could not be
#   created; true when the existing .txt is already newer than the epub;
#   otherwise the result of running ebook-convert (see #exec)
def extract_epub(source_file, source_path, dest_path)
  dest_file = text_path_for(source_file, source_path, dest_path)

  required_path = Pathname(dest_file).dirname
  dir_ready = required_path.directory? || exec('mkdir', '-p', required_path.to_s)
  return false unless dir_ready

  # Nothing to do, extraction is already up-to-date
  return true if up_to_date?(source_file, dest_file)

  # Argument-vector form: file names with spaces or quotes reach the
  # converter intact instead of being split or interpreted by a shell.
  exec('ebook-convert', source_file, dest_file)
end

# Recursively extracts every eligible epub found under +source_path+.
#
# @param source_path [String] directory to walk
# @param dest_path [String] root directory to write .txt files under
# @return [nil]
def scan_dir(source_path, dest_path)
  Find.find(source_path) do |path|
    extract_epub(path, source_path, dest_path) if extractable_epub?(path)
  end
end

# Run only when executed as a script, so the helpers above can be loaded
# without triggering a scan.
if __FILE__ == $PROGRAM_NAME
  dest_path, *source_paths = ARGV
  if source_paths.empty?
    # Covers both "no arguments" (previously a NoMethodError on nil) and
    # "destination only" (previously a silent no-op).
    warn "Usage: #{$PROGRAM_NAME} DEST_DIR SOURCE_DIR [SOURCE_DIR ...]"
  else
    source_paths.each { |source_path| scan_dir(source_path, dest_path) }
  end
end