From: Chris Jaekl Date: Fri, 7 Jul 2017 12:54:38 +0000 (+0900) Subject: Improves handling of non-fiction classification data. X-Git-Url: http://jaekl.net/gitweb/?p=quanlib.git;a=commitdiff_plain;h=0aeb88ddc91aa8f9fd8d93a8004d5df5094a4613 Improves handling of non-fiction classification data. --- diff --git a/book.rb b/book.rb index b0a1bbf..a94e33c 100644 --- a/book.rb +++ b/book.rb @@ -240,6 +240,9 @@ class Book protected def parse_file_name!(file_name) + category = nil # e.g., non-fiction, fan-fiction + grouping = '' + parts = file_name.split('/') (series_code, @volume, @title) = processTitle(parts[-1]) if parts.length > 1 @@ -249,6 +252,9 @@ class Book @author = Author.new(grouping, reading_order, sort_order) @series_id = @store.get_series(grouping, series_code) end + if parts.length > 2 + category = parts[-3] + end lc_file_name = file_name.downcase if lc_file_name.end_with?(".epub") @@ -258,6 +264,13 @@ class Book end @classification_id = @store.find_classification(@author.grouping, File.basename(file_name, '.*')) + + # TODO: Fix horrible hard-coded strings and paths + if ('01_nonfic' == category) && (nil == classification_id) + open(Store.unclassified_csv, 'a') do |fd| + fd.puts('"' + grouping.to_s + '","' + path + '"') + end + end end protected diff --git a/classify/bookclass.rb b/classify/bookclass.rb index 20652dc..47cad99 100644 --- a/classify/bookclass.rb +++ b/classify/bookclass.rb @@ -6,6 +6,7 @@ class BookClass @ddc = nil @grouping = grouping @fast = [] + @filename = [] @lcc = nil @title = title end @@ -25,6 +26,12 @@ class BookClass def fast @fast end + def filename + @filename + end + def filename=(value) + @filename = value + end def grouping @grouping end @@ -57,6 +64,9 @@ class BookClass if nil != @fast data.push('fast=' + @fast.inspect) end + if nil != @filename + data.push('filename=' + @filename.to_s + '"') + end if nil != @lcc data.push('lcc="' + @lcc + '"') end diff --git a/classify/classify.rb b/classify/classify.rb deleted file mode 100644 index 9cc28ae..0000000 --- a/classify/classify.rb +++ /dev/null @@ -1,142 +0,0 @@ -require 'erb' -require 'net/http' -require 'nokogiri' - -require 'bookclass' -require 'classset' -require 'fastset' - -class Lookup - def initialize - @class_set = ClassSet.new() - @fast_set = FastSet.new() - end - - def construct_url(params) - first = true - cmd = 'http://classify.oclc.org/classify2/Classify' - - params += [ ['summary', 'false' ] ] - - params.each do |tuple| - name, value = tuple - if (first) - cmd += '?' - first = false - else - cmd += '&' - end - cmd += name + '=' - cmd += ERB::Util.url_encode(value) - end - - return cmd - end - - def isUpper?(c) - return /[[:upper:]]/.match(c) - end - - def lookup(author_grouping, title) - params = [ - ['author', massage_author(author_grouping)], - ['title', title] - ] - - cmd = construct_url(params) - res = submit_request(cmd) - - doc = Nokogiri::XML(res.body) - - if "4" == response_code(doc) - # Multiple matches; pick the first one and re-query - owi = doc.css("works work")[0]["owi"] - - params = [ - ['owi', owi] - ] - cmd = construct_url(params) - res = submit_request(cmd) - - #puts res.body - - doc = Nokogiri::XML(res.body) - end - - if "2" != response_code(doc) - puts "Lookup failed" - return nil - end - - title = doc.css("classify editions edition")[0]["title"] - - info = BookClass.new(author_grouping, title) - - author = doc.css("classify editions edition")[0]["author"] - info.author = author - - nodes = doc.css("classify recommendations ddc mostPopular") - if nil != nodes && nodes.length > 0 - ddc = nodes[0]["sfa"] - info.ddc = ddc - end - - nodes = doc.css("classify recommendations lcc mostPopular") - if nil != nodes && nodes.length > 0 - lcc = nodes[0]["sfa"] - end - info.lcc = lcc - - headings = doc.css("classify recommendations fast headings heading") - headings.each do |heading| - #puts heading.inspect - id = heading['ident'] - #puts 'ID: ' + id - descr = heading.content - #puts 'DESCR: ' + descr - info.add_fast(id) - @fast_set.add(id, descr) - end - - @class_set.ensure_contains!(info) - - return info - end - - def massage_author(input) - if nil == input - return nil - end - - reading_order = "" - input.each_char do |c| - if isUpper?(c) and (reading_order.length > 0) - reading_order += " " - end - reading_order += c - end - - return reading_order - end - - def response_code(doc) - return doc.css("classify response")[0]["code"] - end - - def save_state - @class_set.save_state() - @fast_set.save_state() - end - - def submit_request(cmd) - puts ('GET ' + cmd) - - url = URI.parse(cmd) - req = Net::HTTP::Get.new(url.to_s) - res = Net::HTTP.start(url.host, url.port) {|http| - http.request(req) - } - return res - end -end - diff --git a/classify/classset.rb b/classify/classset.rb index 710db7a..4613fc7 100644 --- a/classify/classset.rb +++ b/classify/classset.rb @@ -67,16 +67,18 @@ class ClassSet lcc = row[1] grouping = row[2] author = row[3] - title = row[4] + filename = row[4] + title = row[5] fast = [] - if nil != row[5] - fast = row[5].split(';') + if nil != row[6] + fast = row[6].split(';') end bookclass = BookClass.new(grouping, title) bookclass.ddc = ddc bookclass.lcc = lcc bookclass.author = author + bookclass.filename = filename fast.each do |id| bookclass.add_fast(id) @@ -84,6 +86,8 @@ class ClassSet key = construct_key(grouping, title) @entries[key] = bookclass + + #puts 'LOADED[' + key.inspect + ']: ' + bookclass.inspect end end end @@ -91,24 +95,22 @@ class ClassSet def save(file_name) CSV.open(file_name, 'w:UTF-8') do |csv| - csv << ['Dewey', 'LCC', 'Grouping', 'Author', 'Title', 'FAST'] + csv << ['Dewey', 'LCC', 'Grouping', 'Author', 'Filename', 'Title', 'FAST'] @entries.keys.sort.each do |key| info = @entries[key] + #puts 'SAVING[' + key.inspect + ']: ' + info.inspect + ddc = info.ddc lcc = info.lcc grouping = info.grouping author = info.author + filename = info.filename title = info.title - fast_list = info.fast - fast_ids = [] - fast_list.each do |tuple| - fast_ids.push(tuple[0]) - end - fast = fast_ids.join(';') + fast = info.fast.join(';') - csv << [ ddc, lcc, grouping, author, title, fast ] + csv << [ ddc, lcc, grouping, author, filename, title, fast ] end end end diff --git a/classify/fastset.rb b/classify/fastset.rb index fa3883e..0d97aeb 100644 --- a/classify/fastset.rb +++ b/classify/fastset.rb @@ -34,24 +34,26 @@ class FastSet return end - File.open(file_name, 'r:UTF-8').each_line do |line| - cols = line.split(/,/) - if first - first = false - elsif cols.length > 1 - id = cols[0] - descr = cols[1] - @entries[id] = descr + first = true + CSV.open(file_name, 'r:UTF-8') do |csv| + csv.to_a.each do |row| + if first + first = false + elsif row.length >= 2 + id = row[0] + descr = row[1] + @entries[id] = descr + end end end end def save(file_name) - File.open(file_name, 'w:UTF-8') do |fd| - fd.puts('Code,Description') + CSV.open(file_name, 'w:UTF-8') do |csv| + csv << [ 'Code', 'Description' ] @entries.keys.sort.each do |key| - fd.puts(key.to_s + ',' + @entries[key].to_s) + csv << [ key.to_s, @entries[key].to_s ] end end end diff --git a/classify/lookup.rb b/classify/lookup.rb new file mode 100644 index 0000000..1fa5faa --- /dev/null +++ b/classify/lookup.rb @@ -0,0 +1,161 @@ +require 'erb' +require 'net/http' +require 'nokogiri' + +require 'bookclass' +require 'classset' +require 'fastset' + +class Lookup + def initialize + @class_set = ClassSet.new() + @fast_set = FastSet.new() + end + + def construct_url(params) + first = true + cmd = 'http://classify.oclc.org/classify2/Classify' + + params += [ ['summary', 'false' ] ] + + params.each do |tuple| + name, value = tuple + if (first) + cmd += '?' + first = false + else + cmd += '&' + end + cmd += name + '=' + cmd += ERB::Util.url_encode(value) + end + + return cmd + end + + def isUpper?(c) + return /[[:upper:]]/.match(c) + end + + def lookup(author_grouping, title) + params = [ + ['author', massage_author(author_grouping)], + ['title', massage_title(title)] + ] + + cmd = construct_url(params) + res = submit_request(cmd) + + doc = Nokogiri::XML(res.body) + + if "4" == response_code(doc) + # Multiple matches; pick the first one and re-query + owi = doc.css("works work")[0]["owi"] + + params = [ + ['owi', owi] + ] + cmd = construct_url(params) + res = submit_request(cmd) + + #puts res.body + + doc = Nokogiri::XML(res.body) + end + + if "2" != response_code(doc) + puts "Lookup failed" + return nil + end + + title = doc.css("classify editions edition")[0]["title"] + + info = BookClass.new(author_grouping, title) + + author = doc.css("classify editions edition")[0]["author"] + info.author = author + + nodes = doc.css("classify recommendations ddc mostPopular") + if nil != nodes && nodes.length > 0 + ddc = nodes[0]["sfa"] + info.ddc = ddc + end + + nodes = doc.css("classify recommendations lcc mostPopular") + if nil != nodes && nodes.length > 0 + lcc = nodes[0]["sfa"] + end + info.lcc = lcc + + headings = doc.css("classify recommendations fast headings heading") + headings.each do |heading| + #puts heading.inspect + id = heading['ident'] + #puts 'ID: ' + id + descr = heading.content + #puts 'DESCR: ' + descr + info.add_fast(id) + @fast_set.add(id, descr) + end + + info.filename = title + + @class_set.ensure_contains!(info) + + return info + end + + def massage_author(input) + if nil == input + return nil + end + + reading_order = "" + input.each_char do |c| + if isUpper?(c) and (reading_order.length > 0) + reading_order += " " + end + reading_order += c + end + + return reading_order + end + + def massage_title(pathname) + basename = File.basename(pathname, '.*') + + result = "" + basename.each_char do |c| + if '_' == c + result += ' ' + elsif '-' == c + result += "'" + else + result += c + end + end + + return result + end + + def response_code(doc) + return doc.css("classify response")[0]["code"] + end + + def save_state + @class_set.save_state() + @fast_set.save_state() + end + + def submit_request(cmd) + puts ('GET ' + cmd) + + url = URI.parse(cmd) + req = Net::HTTP::Get.new(url.to_s) + res = Net::HTTP.start(url.host, url.port) {|http| + http.request(req) + } + return res + end +end + diff --git a/main.rb b/main.rb index 6e7a441..4ea9c70 100644 --- a/main.rb +++ b/main.rb @@ -12,6 +12,9 @@ def handleArg(arg) if "--purge" == arg puts 'Purging database...' @store.dropSchema() + if File.exists?(Store.unclassified_csv) + File.delete(Store.unclassified_csv) + end elsif arg.start_with?("--") abort('ERROR: Unrecognized option "' + arg + '".') end diff --git a/store.rb b/store.rb index b1e3d7f..4895a5b 100644 --- a/store.rb +++ b/store.rb @@ -6,8 +6,14 @@ require 'pg' require 'series' class Store + @@BASEPATH = '/arc/quanlib' # TODO: FIXME: configure this in a sane way + @@UNCLASSIFIED_CSV = @@BASEPATH + '/unclassified.csv' + + def self.unclassified_csv + @@UNCLASSIFIED_CSV + end + def initialize - @basepath = '/arc/quanlib' # TODO: FIXME: configure this in a sane way @conn = nil #@dburl = 'dbi:Pg:quanlib:localhost' @@ -328,7 +334,7 @@ EOS (efspath, efsname) = construct_efs_path(id) - fullpath = @basepath + '/efs/' + efspath + '/' + efsname + fullpath = @@BASEPATH + '/efs/' + efspath + '/' + efsname return Cover.new(nil, fullpath, mime_type) @@ -357,7 +363,7 @@ EOS (efspath, efsname) = construct_efs_path(efs_id) - efspath = @basepath + '/efs/' + efspath + efspath = @@BASEPATH + '/efs/' + efspath FileUtils.mkdir_p(efspath) @@ -444,7 +450,7 @@ EOS def populate_classifications_table puts "Populating the Classifications table..." first = true - CSV.foreach(@basepath + '/csv/class.csv') do |row| + CSV.foreach(@@BASEPATH + '/csv/class.csv') do |row| if first # skip the header row first = false @@ -484,7 +490,7 @@ EOS def populate_fast_table puts "Populating the FAST table..." first = true - CSV.foreach(@basepath + '/csv/fast.csv') do |row| + CSV.foreach(@@BASEPATH + '/csv/fast.csv') do |row| if first first = false # skip the header row else @@ -498,7 +504,7 @@ EOS def populate_series_table puts "Populating the Series table..." - CSV.foreach(@basepath + '/csv/series.csv') do |row| + CSV.foreach(@@BASEPATH + '/csv/series.csv') do |row| id = next_id('series_id') sqlInsert = "INSERT INTO Series (id, age, genre, grouping, code, descr) VALUES ($1, $2, $3, $4, $5, $6);" args = [id] + row diff --git a/walkdir.rb b/walkdir.rb index fb23fcf..800b7fd 100644 --- a/walkdir.rb +++ b/walkdir.rb @@ -4,7 +4,7 @@ # .../AuthorName/Title_of_the_Awesome_Book.ext # # Author is given as FirstLast. For example, -# Robert Anson Heinlein is RoberHeinlein, and +# Robert Anson Heinlein is RobertHeinlein, and # JKRowling is JoanneRowling. # # Book titles have spaces replaced with underscores,