From 0aeb88ddc91aa8f9fd8d93a8004d5df5094a4613 Mon Sep 17 00:00:00 2001 From: Chris Jaekl Date: Fri, 7 Jul 2017 21:54:38 +0900 Subject: [PATCH] Improves handling of non-fiction classification data. --- book.rb | 13 +++++++++++++ classify/bookclass.rb | 10 ++++++++++ classify/classset.rb | 24 +++++++++++++----------- classify/fastset.rb | 24 +++++++++++++----------- classify/{classify.rb => lookup.rb} | 21 ++++++++++++++++++++- main.rb | 3 +++ store.rb | 18 ++++++++++++------ walkdir.rb | 2 +- 8 files changed, 85 insertions(+), 30 deletions(-) rename classify/{classify.rb => lookup.rb} (89%) diff --git a/book.rb b/book.rb index b0a1bbf..a94e33c 100644 --- a/book.rb +++ b/book.rb @@ -240,6 +240,9 @@ class Book protected def parse_file_name!(file_name) + category = nil # e.g., non-fiction, fan-fiction + grouping = '' + parts = file_name.split('/') (series_code, @volume, @title) = processTitle(parts[-1]) if parts.length > 1 @@ -249,6 +252,9 @@ class Book @author = Author.new(grouping, reading_order, sort_order) @series_id = @store.get_series(grouping, series_code) end + if parts.length > 2 + category = parts[-3] + end lc_file_name = file_name.downcase if lc_file_name.end_with?(".epub") @@ -258,6 +264,13 @@ class Book end @classification_id = @store.find_classification(@author.grouping, File.basename(file_name, '.*')) + + # TODO: Fix horrible hard-coded strings and paths + if ('01_nonfic' == category) && (nil == classification_id) + open(Store.unclassified_csv, 'a') do |fd| + fd.puts('"' + grouping.to_s + '","' + path + '"') + end + end end protected diff --git a/classify/bookclass.rb b/classify/bookclass.rb index 20652dc..47cad99 100644 --- a/classify/bookclass.rb +++ b/classify/bookclass.rb @@ -6,6 +6,7 @@ class BookClass @ddc = nil @grouping = grouping @fast = [] + @filename = [] @lcc = nil @title = title end @@ -25,6 +26,12 @@ class BookClass def fast @fast end + def filename + @filename + end + def filename=(value) + @filename = value + end def grouping @grouping end @@ -57,6 +64,9 @@ class BookClass if nil != @fast data.push('fast=' + @fast.inspect) end + if nil != @filename + data.push('filename=' + @filename.to_s + '"') + end if nil != @lcc data.push('lcc="' + @lcc + '"') end diff --git a/classify/classset.rb b/classify/classset.rb index 710db7a..4613fc7 100644 --- a/classify/classset.rb +++ b/classify/classset.rb @@ -67,16 +67,18 @@ class ClassSet lcc = row[1] grouping = row[2] author = row[3] - title = row[4] + filename = row[4] + title = row[5] fast = [] - if nil != row[5] - fast = row[5].split(';') + if nil != row[6] + fast = row[6].split(';') end bookclass = BookClass.new(grouping, title) bookclass.ddc = ddc bookclass.lcc = lcc bookclass.author = author + bookclass.filename = filename fast.each do |id| bookclass.add_fast(id) @@ -84,6 +86,8 @@ class ClassSet key = construct_key(grouping, title) @entries[key] = bookclass + + #puts 'LOADED[' + key.inspect + ']: ' + bookclass.inspect end end end @@ -91,24 +95,22 @@ class ClassSet def save(file_name) CSV.open(file_name, 'w:UTF-8') do |csv| - csv << ['Dewey', 'LCC', 'Grouping', 'Author', 'Title', 'FAST'] + csv << ['Dewey', 'LCC', 'Grouping', 'Author', 'Filename', 'Title', 'FAST'] @entries.keys.sort.each do |key| info = @entries[key] + #puts 'SAVING[' + key.inspect + ']: ' + info.inspect + ddc = info.ddc lcc = info.lcc grouping = info.grouping author = info.author + filename = info.filename title = info.title - fast_list = info.fast - fast_ids = [] - fast_list.each do |tuple| - fast_ids.push(tuple[0]) - end - fast = fast_ids.join(';') + fast = info.fast.join(';') - csv << [ ddc, lcc, grouping, author, title, fast ] + csv << [ ddc, lcc, grouping, author, filename, title, fast ] end end end diff --git a/classify/fastset.rb b/classify/fastset.rb index fa3883e..0d97aeb 100644 --- a/classify/fastset.rb +++ b/classify/fastset.rb @@ -34,24 +34,26 @@ class FastSet return end - File.open(file_name, 'r:UTF-8').each_line do |line| - cols = line.split(/,/) - if first - first = false - elsif cols.length > 1 - id = cols[0] - descr = cols[1] - @entries[id] = descr + first = true + CSV.open(file_name, 'r:UTF-8') do |csv| + csv.to_a.each do |row| + if first + first = false + elsif row.length >= 2 + id = row[0] + descr = row[1] + @entries[id] = descr + end end end end def save(file_name) - File.open(file_name, 'w:UTF-8') do |fd| - fd.puts('Code,Description') + CSV.open(file_name, 'w:UTF-8') do |csv| + csv << [ 'Code', 'Description' ] @entries.keys.sort.each do |key| - fd.puts(key.to_s + ',' + @entries[key].to_s) + csv << [ key.to_s, @entries[key].to_s ] end end end diff --git a/classify/classify.rb b/classify/lookup.rb similarity index 89% rename from classify/classify.rb rename to classify/lookup.rb index 9cc28ae..1fa5faa 100644 --- a/classify/classify.rb +++ b/classify/lookup.rb @@ -40,7 +40,7 @@ class Lookup def lookup(author_grouping, title) params = [ ['author', massage_author(author_grouping)], - ['title', title] + ['title', massage_title(title)] ] cmd = construct_url(params) @@ -98,6 +98,8 @@ class Lookup @fast_set.add(id, descr) end + info.filename = title + @class_set.ensure_contains!(info) return info @@ -119,6 +121,23 @@ class Lookup return reading_order end + def massage_title(pathname) + basename = File.basename(pathname, '.*') + + result = "" + basename.each_char do |c| + if '_' == c + result += ' ' + elsif '-' == c + result += "'" + else + result += c + end + end + + return result + end + def response_code(doc) return doc.css("classify response")[0]["code"] end diff --git a/main.rb b/main.rb index 6e7a441..4ea9c70 100644 --- a/main.rb +++ b/main.rb @@ -12,6 +12,9 @@ def handleArg(arg) if "--purge" == arg puts 'Purging database...' @store.dropSchema() + if File.exists?(Store.unclassified_csv) + File.delete(Store.unclassified_csv) + end elsif arg.start_with?("--") abort('ERROR: Unrecognized option "' + arg + '".') end diff --git a/store.rb b/store.rb index b1e3d7f..4895a5b 100644 --- a/store.rb +++ b/store.rb @@ -6,8 +6,14 @@ require 'pg' require 'series' class Store + @@BASEPATH = '/arc/quanlib' # TODO: FIXME: configure this in a sane way + @@UNCLASSIFIED_CSV = @@BASEPATH + '/unclassified.csv' + + def self.unclassified_csv + @@UNCLASSIFIED_CSV + end + def initialize - @basepath = '/arc/quanlib' # TODO: FIXME: configure this in a sane way @conn = nil #@dburl = 'dbi:Pg:quanlib:localhost' @@ -328,7 +334,7 @@ EOS (efspath, efsname) = construct_efs_path(id) - fullpath = @basepath + '/efs/' + efspath + '/' + efsname + fullpath = @@BASEPATH + '/efs/' + efspath + '/' + efsname return Cover.new(nil, fullpath, mime_type) @@ -357,7 +363,7 @@ EOS (efspath, efsname) = construct_efs_path(efs_id) - efspath = @basepath + '/efs/' + efspath + efspath = @@BASEPATH + '/efs/' + efspath FileUtils.mkdir_p(efspath) @@ -444,7 +450,7 @@ EOS def populate_classifications_table puts "Populating the Classifications table..." first = true - CSV.foreach(@basepath + '/csv/class.csv') do |row| + CSV.foreach(@@BASEPATH + '/csv/class.csv') do |row| if first # skip the header row first = false @@ -484,7 +490,7 @@ EOS def populate_fast_table puts "Populating the FAST table..." first = true - CSV.foreach(@basepath + '/csv/fast.csv') do |row| + CSV.foreach(@@BASEPATH + '/csv/fast.csv') do |row| if first first = false # skip the header row else @@ -498,7 +504,7 @@ EOS def populate_series_table puts "Populating the Series table..." - CSV.foreach(@basepath + '/csv/series.csv') do |row| + CSV.foreach(@@BASEPATH + '/csv/series.csv') do |row| id = next_id('series_id') sqlInsert = "INSERT INTO Series (id, age, genre, grouping, code, descr) VALUES ($1, $2, $3, $4, $5, $6);" args = [id] + row diff --git a/walkdir.rb b/walkdir.rb index fb23fcf..800b7fd 100644 --- a/walkdir.rb +++ b/walkdir.rb @@ -4,7 +4,7 @@ # .../AuthorName/Title_of_the_Awesome_Book.ext # # Author is given as FirstLast. For example, -# Robert Anson Heinlein is RoberHeinlein, and +# Robert Anson Heinlein is RobertHeinlein, and # JKRowling is JoanneRowling. # # Book titles have spaces replaced with underscores, -- 2.39.2