From: Chris Jaekl Date: Wed, 30 Dec 2020 23:46:15 +0000 (-0500) Subject: Multi-thread the parsing of books to speed up loading. X-Git-Url: https://jaekl.net/gitweb/?a=commitdiff_plain;h=56ed007c7918ec7ea34b431b1f0f4ef9bbb02c56;p=quanlib.git Multi-thread the parsing of books to speed up loading. --- diff --git a/book_loader.rb b/book_loader.rb new file mode 100644 index 0000000..5516f04 --- /dev/null +++ b/book_loader.rb @@ -0,0 +1,28 @@ + +require_relative 'book' +require_relative 'store' + +class BookLoader + DONE_MARKER = '' + + def initialize(config_file, queue) + @config_file = config_file + @queue = queue + end + + def run + @store = Store.new(@config_file) + @store.connect() + + file = @queue.pop + until file == DONE_MARKER do + book = Book.new(@store) + book.load_from_file!(file) + @store.store_book(book) + + file = @queue.pop + end + + @store.disconnect() + end +end diff --git a/main.rb b/main.rb index b0c4b79..e294b4a 100644 --- a/main.rb +++ b/main.rb @@ -1,11 +1,10 @@ require_relative 'navigator' require_relative 'page' require_relative 'store' -require_relative 'walkdir' +require_relative 'walk_dir' @outputDir = 'output' -book_ids = [] @config_file = 'quanlib.ini' @skip_class = false @@ -39,8 +38,8 @@ end for arg in ARGV if ! arg.start_with?("--") puts 'Scanning directory "' + arg + '"...' - w = WalkDir.new(@store, arg) - book_ids += (w.books) + w = WalkDir.new(@config_file, arg) + w.books end end diff --git a/walk_dir.rb b/walk_dir.rb new file mode 100644 index 0000000..a2c088f --- /dev/null +++ b/walk_dir.rb @@ -0,0 +1,118 @@ +# Walk the directory (and subdirectories), identifying books. +# +# Expected format: +# .../AuthorName/Title_of_the_Awesome_Book.ext +# +# Author is given as FirstLast. For example, +# Robert Anson Heinlein is RobertHeinlein, and +# JKRowling is JoanneRowling. +# +# Book titles have spaces replaced with underscores, +# and punctuation [,!?'] replaced with hyphens. +# +# If the book forms part of a series, then an all-capitals +# series designator, followed by a numeric volume number, +# followed by an underscore, is prefixed to the name. +# For example, Hardy Boys' volume 1, The Tower Treasure, +# is rendered as .../FranklinDixon/HB001_The_Tower_Treasure.epub +# and Mrs. Pollifax volume 6, On the China Station, is +# .../DorothyGilman/P06_On_the_China_Station.epub. + +require_relative 'book' +require_relative 'book_loader' +require_relative 'store' + +class WalkDir + def initialize(config_file, root) + @queue = Queue.new + @root = root + @config_file = config_file + @threads = [] + + @files = walk(@root) + end + + def books + @threads = [] + num_threads.times do + @threads << Thread.new do + BookLoader.new(@config_file, @queue).run + end + end + + result = [] + @files = remove_duplicates(@files) + for file in @files.sort() + if Book.can_handle?(file) && (!is_duplicate?(file)) + # Queue this book to be loaded and added to the DB by a BookLoader thread + @queue << file + end + end + + @threads.count.times { @queue << BookLoader::DONE_MARKER } + + @threads.each { |t| t.join } + end + + # Duplicate versions of a text are named + # xxx_suffix.ext + # Where suffix is one of bis, ter, quater, quinquies + # for the 2nd, 3rd, 4th or 5th variant respectively. + def is_duplicate?(file) + s = file.to_s + suffix = ['_bis.', '_ter.', '_quater.', '_quinquies.'] + suffix.each do |pat| + if s.include?(pat) + return true + end + end + + return false + end + + def remove_duplicates(files) + unique = {} + for file in files + if Book.can_handle?(file) + key = File.dirname(file) + '/' + File.basename(file, '.*') + if unique.has_key?(key) + new_ext = File.extname(file) + old_ext = File.extname(unique[key]) + if ('.pdf' == old_ext) && ('.epub' == new_ext) + # Prefer EPUB over PDF + puts 'REPLACED ' + unique[key].to_s + ' with ' + file.to_s + unique[key] = file + else + puts 'DROPPED ' + file.to_s + " because it's superceded by " + unique[key].to_s + end + else + unique[key] = file + end + end + end + + return unique.values + end + + def walk(path) + result = [] + children = Dir.entries(path) + for child in children + fullName = (path.chomp("/")) + "/" + child + if (File.directory?(fullName)) and (child != ".") and (child != "..") and (!File.symlink?(fullName)) + sub = walk(fullName) + if (sub != nil) and (sub.length > 0) + result.concat(sub) + end + elsif (! File.directory?(fullName)) + result.push(fullName) + end + end + return result + end + + def num_threads + # TOOD: make this (auto?) configurable + 12 + end +end diff --git a/walkdir.rb b/walkdir.rb deleted file mode 100644 index bbb56a2..0000000 --- a/walkdir.rb +++ /dev/null @@ -1,101 +0,0 @@ -# Walk the directory (and subdirectories), identifying books. -# -# Expected format: -# .../AuthorName/Title_of_the_Awesome_Book.ext -# -# Author is given as FirstLast. For example, -# Robert Anson Heinlein is RobertHeinlein, and -# JKRowling is JoanneRowling. -# -# Book titles have spaces replaced with underscores, -# and punctuation [,!?'] replaced with hyphens. -# -# If the book forms part of a series, then an all-capitals -# series designator, followed by a numeric volume number, -# followed by an underscore, is prefixed to the name. -# For example, Hardy Boys' volume 1, The Tower Treasure, -# is rendered as .../FranklinDixon/HB001_The_Tower_Treasure.epub -# and Mrs. Pollifax volume 6, On the China Station, is -# .../DorothyGilman/P06_On_the_China_Station.epub. - -require_relative 'book' -require_relative 'store' - -class WalkDir - def initialize(store, root) - @root = root - @store = store - @files = walk(@root) - end - - def books - result = [] - @files = remove_duplicates(@files) - for file in @files.sort() - if Book.can_handle?(file) && (!is_duplicate?(file)) - book = Book.new(@store) - book.load_from_file!(file) - id = @store.store_book(book) - result.push(id) - end - end - return result - end - - # Duplicate versions of a text are named - # xxx_suffix.ext - # Where suffix is one of bis, ter, quater, quinquies - # for the 2nd, 3rd, 4th or 5th variant respectively. - def is_duplicate?(file) - s = file.to_s - suffix = ['_bis.', '_ter.', '_quater.', '_quinquies.'] - suffix.each do |pat| - if s.include?(pat) - return true - end - end - - return false - end - - def remove_duplicates(files) - unique = {} - for file in files - if Book.can_handle?(file) - key = File.dirname(file) + '/' + File.basename(file, '.*') - if unique.has_key?(key) - new_ext = File.extname(file) - old_ext = File.extname(unique[key]) - if ('.pdf' == old_ext) && ('.epub' == new_ext) - # Prefer EPUB over PDF - puts 'REPLACED ' + unique[key].to_s + ' with ' + file.to_s - unique[key] = file - else - puts 'DROPPED ' + file.to_s + " because it's superceded by " + unique[key].to_s - end - else - unique[key] = file - end - end - end - - return unique.values - end - - def walk(path) - result = [] - children = Dir.entries(path) - for child in children - fullName = (path.chomp("/")) + "/" + child - if (File.directory?(fullName)) and (child != ".") and (child != "..") and (!File.symlink?(fullName)) - sub = walk(fullName) - if (sub != nil) and (sub.length > 0) - result.concat(sub) - end - elsif (! File.directory?(fullName)) - result.push(fullName) - end - end - return result - end -end