From 2c6d69af97c152524366d3fefe1808dfb78f8f56 Mon Sep 17 00:00:00 2001 From: Chris Jaekl Date: Wed, 28 Jun 2017 18:05:20 +0900 Subject: [PATCH] Add support for classification of non-fiction books. --- book.rb | 21 ++++- classify/bookclass.rb | 70 ++++++++++++++ classify/classify.rb | 142 +++++++++++++++++++++++++++++ classify/classset.rb | 120 ++++++++++++++++++++++++ classify/fastset.rb | 63 +++++++++++++ classify/queryoclc.rb | 57 ++++++++++++ main.rb | 1 + navigator.rb | 17 +++- store.rb | 206 +++++++++++++++++++++++++++++++++--------- walkdir.rb | 2 +- 10 files changed, 653 insertions(+), 46 deletions(-) create mode 100644 classify/bookclass.rb create mode 100644 classify/classify.rb create mode 100644 classify/classset.rb create mode 100644 classify/fastset.rb create mode 100644 classify/queryoclc.rb diff --git a/book.rb b/book.rb index e0ccc84..5b698e7 100644 --- a/book.rb +++ b/book.rb @@ -11,6 +11,7 @@ class Book def initialize(store) @author = nil + @classification_id = nil @cover = nil @description = nil @path = nil @@ -20,7 +21,7 @@ class Book @volume = nil end - def load_from_file(fileName) + def load_from_file!(fileName) @path = fileName parse_file_name!(fileName) end @@ -52,6 +53,14 @@ class Book @author = value end + def classification_id + @classification_id + end + + def classification_id=(value) + @classification_id = value + end + def cover return @cover end @@ -146,6 +155,14 @@ class Book @title = value end + def title_grouping + if nil == @path + return nil + end + + return File.basename(@path, '.*') + end + def volume @volume end @@ -225,6 +242,8 @@ class Book elsif lc_file_name.end_with?(".pdf") scan_pdf!(file_name) end + + @classification_id = @store.find_classification(@author.grouping, File.basename(file_name, '.*')) end protected diff --git a/classify/bookclass.rb b/classify/bookclass.rb new file mode 100644 index 0000000..20652dc --- /dev/null +++ b/classify/bookclass.rb @@ -0,0 +1,70 @@ +# Classification information for a single book + 
+class BookClass
+  def initialize(grouping, title)
+    @author = nil
+    @ddc = nil
+    @grouping = grouping
+    @fast = []
+    @lcc = nil
+    @title = title
+  end
+
+  def author
+    @author
+  end
+  def author=(value)
+    @author = value
+  end
+  def ddc
+    @ddc
+  end
+  def ddc=(value)
+    @ddc = value
+  end
+  def fast
+    @fast
+  end
+  def grouping
+    @grouping
+  end
+  def lcc
+    @lcc
+  end
+  def lcc=(value)
+    @lcc = value
+  end
+  def title
+    @title
+  end
+
+  def add_fast(id)
+    @fast.push(id)
+  end
+
+  def inspect
+    data = []
+    # Only fields that are set (non-nil) are listed.
+    if nil != @author
+      data.push('author="' + @author + '"')
+    end
+    if nil != @ddc
+      data.push('ddc="' + @ddc + '"')
+    end
+    if nil != @grouping
+      data.push('grouping="' + @grouping + '"')
+    end
+    if nil != @fast
+      data.push('fast=' + @fast.inspect)
+    end
+    if nil != @lcc
+      data.push('lcc="' + @lcc + '"')
+    end
+    if nil != @title
+      data.push('title="' + @title + '"')
+    end
+
+    return '(BookClass:' + data.join(',') + ')'
+  end
+end
+
diff --git a/classify/classify.rb b/classify/classify.rb
new file mode 100644
index 0000000..9cc28ae
--- /dev/null
+++ b/classify/classify.rb
@@ -0,0 +1,161 @@
+require 'erb'
+require 'net/http'
+require 'nokogiri'
+
+require_relative 'bookclass'
+require_relative 'classset'
+require_relative 'fastset'
+
+# Looks up classification data (DDC, LCC and FAST subject headings) for a
+# book through the OCLC Classify web service, and records what it finds in
+# a ClassSet and a FastSet.
+class Lookup
+  def initialize
+    @class_set = ClassSet.new()
+    @fast_set = FastSet.new()
+  end
+
+  # Builds the request URL from a list of [name, value] pairs.  A
+  # "summary=false" parameter is always appended; values are URL-encoded.
+  def construct_url(params)
+    first = true
+    cmd = 'http://classify.oclc.org/classify2/Classify'
+
+    params += [ ['summary', 'false' ] ]
+
+    params.each do |tuple|
+      name, value = tuple
+      if (first)
+        cmd += '?'
+        first = false
+      else
+        cmd += '&'
+      end
+      cmd += name + '='
+      cmd += ERB::Util.url_encode(value)
+    end
+
+    return cmd
+  end
+
+  def isUpper?(c)
+    return /[[:upper:]]/.match(c)
+  end
+
+  # Queries the service for one book.  Returns a BookClass on success, or
+  # nil (after printing "Lookup failed") when the final response code is
+  # not "2" or the response lacks the nodes this method reads.
+  def lookup(author_grouping, title)
+    params = [
+      ['author', massage_author(author_grouping)],
+      ['title', title]
+    ]
+
+    cmd = construct_url(params)
+    res = submit_request(cmd)
+
+    doc = Nokogiri::XML(res.body)
+
+    if "4" == response_code(doc)
+      # Multiple matches; pick the first one and re-query
+      work = doc.css("works work")[0]
+      if nil == work
+        puts "Lookup failed"
+        return nil
+      end
+
+      params = [
+        ['owi', work["owi"]]
+      ]
+      cmd = construct_url(params)
+      res = submit_request(cmd)
+
+      #puts res.body
+
+      doc = Nokogiri::XML(res.body)
+    end
+
+    edition = doc.css("classify editions edition")[0]
+    if "2" != response_code(doc) || nil == edition
+      puts "Lookup failed"
+      return nil
+    end
+
+    # NOTE(review): the record is stored under the title reported by the
+    # service, not the title that was asked for -- confirm this is intended.
+    title = edition["title"]
+
+    info = BookClass.new(author_grouping, title)
+    info.author = edition["author"]
+
+    nodes = doc.css("classify recommendations ddc mostPopular")
+    if nil != nodes && nodes.length > 0
+      info.ddc = nodes[0]["sfa"]
+    end
+
+    nodes = doc.css("classify recommendations lcc mostPopular")
+    if nil != nodes && nodes.length > 0
+      info.lcc = nodes[0]["sfa"]
+    end
+
+    headings = doc.css("classify recommendations fast headings heading")
+    headings.each do |heading|
+      #puts heading.inspect
+      id = heading['ident']
+      #puts 'ID: ' + id
+      descr = heading.content
+      #puts 'DESCR: ' + descr
+      info.add_fast(id)
+      @fast_set.add(id, descr)
+    end
+
+    @class_set.ensure_contains!(info)
+
+    return info
+  end
+
+  # Inserts a space before every upper-case letter except the first
+  # character, e.g. "SmithJohn" becomes "Smith John".  Returns nil for nil.
+  def massage_author(input)
+    if nil == input
+      return nil
+    end
+
+    reading_order = ""
+    input.each_char do |c|
+      if isUpper?(c) && (reading_order.length > 0)
+        reading_order += " "
+      end
+      reading_order += c
+    end
+
+    return reading_order
+  end
+
+  # Returns the "code" attribute of the response element, or nil when the
+  # document has no such element.
+  def response_code(doc)
+    node = doc.css("classify response")[0]
+    if nil == node
+      return nil
+    end
+    return node["code"]
+  end
+
+  def save_state
+    @class_set.save_state()
+    @fast_set.save_state()
+  end
+
+  def submit_request(cmd)
+    puts('GET ' + cmd)
+
+    url = URI.parse(cmd)
+    req = Net::HTTP::Get.new(url.to_s)
+    res = Net::HTTP.start(url.host, url.port) {|http|
+      http.request(req)
+    }
+    return res
+  end
+end
+
diff --git a/classify/classset.rb b/classify/classset.rb
new file mode 100644
index 0000000..710db7a
--- /dev/null
+++ b/classify/classset.rb
@@ -0,0 +1,123 @@
+require 'csv'
+
+require_relative 'bookclass'
+
+# A set of BookClass records keyed by author grouping and title, persisted
+# to a CSV file.
+class ClassSet
+  @@class_csv_file = 'class.csv'
+
+  def initialize
+    @entries = {}
+    load!(@@class_csv_file)
+  end
+
+  def add!(info)
+    key = construct_key(info.grouping, info.title)
+    @entries[key] = info
+  end
+
+  def construct_key(author_grouping, title)
+    author_grouping.to_s + '|' + title.to_s
+  end
+
+  def get(author_grouping, title)
+    key = construct_key(author_grouping, title)
+    if @entries.has_key?(key)
+      return @entries[key]
+    else
+      return nil
+    end
+  end
+
+  def has_key?(author_grouping, title)
+    @entries.has_key?(construct_key(author_grouping, title))
+  end
+
+  def ensure_contains!(info)
+    if ! has_key?(info.grouping, info.title)
+      add!(info)
+    end
+  end
+
+  def inspect
+    data = []
+
+    if nil != @entries
+      data.push('entries=' + @entries.inspect + '')
+    end
+
+    return '(ClassSet:' + data.join(',') + ')'
+  end
+
+  # Replaces the current entries with those read from file_name.  The first
+  # row is treated as a header; rows with fewer than six columns are skipped.
+  def load!(file_name)
+    first = true
+    @entries = {}
+
+    if ! File.exist?(file_name)
+      puts 'WARNING: file "' + file_name + '" not found.'
+      return
+    end
+
+    File.open(file_name, 'r:UTF-8') do |fd|
+      csv = CSV.new(fd)
+      csv.to_a.each do |row|
+        if first
+          first = false
+        elsif row.length >= 6
+          ddc = row[0]
+          lcc = row[1]
+          grouping = row[2]
+          author = row[3]
+          title = row[4]
+          fast = []
+          if nil != row[5]
+            fast = row[5].split(';')
+          end
+
+          bookclass = BookClass.new(grouping, title)
+          bookclass.ddc = ddc
+          bookclass.lcc = lcc
+          bookclass.author = author
+
+          fast.each do |id|
+            bookclass.add_fast(id)
+          end
+
+          key = construct_key(grouping, title)
+          @entries[key] = bookclass
+        end
+      end
+    end
+  end
+
+  # Writes every entry to file_name, sorted by key, under a header row.
+  def save(file_name)
+    CSV.open(file_name, 'w:UTF-8') do |csv|
+      csv << ['Dewey', 'LCC', 'Grouping', 'Author', 'Title', 'FAST']
+
+      @entries.keys.sort.each do |key|
+        info = @entries[key]
+
+        ddc = info.ddc
+        lcc = info.lcc
+        grouping = info.grouping
+        author = info.author
+        title = info.title
+        # info.fast holds plain id strings (see BookClass#add_fast), so they
+        # are joined whole; indexing each one would keep only its first
+        # character.
+        fast = info.fast.join(';')
+
+        csv << [ ddc, lcc, grouping, author, title, fast ]
+      end
+    end
+  end
+
+  def save_state
+    save(@@class_csv_file)
+  end
+end
+
diff --git a/classify/fastset.rb b/classify/fastset.rb
new file mode 100644
index 0000000..fa3883e
--- /dev/null
+++ b/classify/fastset.rb
@@ -0,0 +1,78 @@
+require 'csv'
+
+# A table of FAST heading ids and their descriptions, persisted to a CSV
+# file.
+class FastSet
+  @@fast_csv_file = 'fast.csv'
+
+  def initialize
+    @entries = {}
+    load!(@@fast_csv_file)
+  end
+
+  def add(id, descr)
+    @entries[id] = descr
+  end
+
+  def has_key?(value)
+    @entries.has_key?(value)
+  end
+
+  def inspect
+    data = []
+
+    if nil != @entries
+      data.push('entries=' + @entries.inspect + '')
+    end
+
+    return '(FastSet:' + data.join(',') + ')'
+  end
+
+  # Replaces the current entries with those read from file_name.  The first
+  # row is treated as a header; rows with fewer than two columns are skipped.
+  def load!(file_name)
+    @entries = {}
+
+    if ! File.exist?(file_name)
+      puts 'WARNING: file "' + file_name + '" not found.'
+      return
+    end
+
+    read_rows(file_name).drop(1).each do |cols|
+      if cols.length > 1
+        # A description written without quoting may have been split on its
+        # own commas; put it back together.
+        @entries[cols[0]] = cols[1..-1].join(',')
+      end
+    end
+  end
+
+  # Writes every entry to file_name, sorted by id, under a header row.
+  # Fields are CSV-quoted so that descriptions containing commas survive a
+  # round trip.
+  def save(file_name)
+    CSV.open(file_name, 'w:UTF-8') do |csv|
+      csv << ['Code', 'Description']
+
+      @entries.keys.sort.each do |key|
+        csv << [key.to_s, @entries[key].to_s]
+      end
+    end
+  end
+
+  def save_state
+    save(@@fast_csv_file)
+  end
+
+  private
+
+  # Returns the rows of file_name as arrays of columns.  Files that are not
+  # well-formed CSV are split line by line on the first comma instead.
+  def read_rows(file_name)
+    return CSV.read(file_name, :encoding => 'UTF-8')
+  rescue CSV::MalformedCSVError
+    lines = File.readlines(file_name, :encoding => 'UTF-8')
+    return lines.map { |line| line.chomp.split(',', 2) }
+  end
+end
+
diff --git a/classify/queryoclc.rb b/classify/queryoclc.rb
new file mode 100644
index 0000000..b336804
--- /dev/null
+++ b/classify/queryoclc.rb
@@ -0,0 +1,60 @@
+# Batch driver: looks up every author/title row of unclassified.csv that is
+# not already in the ClassSet, pausing between requests.
+
+require_relative 'classset'
+require_relative 'classify'
+
+#if ARGV.length != 2
+#  puts 'Usage: ruby classify.rb author title'
+#  exit 1
+#end
+#
+#author_grouping = ARGV[0]
+#title = ARGV[1]
+
+classset = ClassSet.new()
+lookup = Lookup.new()
+
+CSV.open('unclassified.csv', 'r:UTF-8') do |csv|
+  query_count = 0
+  first = true
+  csv.to_a.each do |row|
+    if first
+      first = false
+    elsif row.length >= 2
+      author_grouping = row[0]
+      title = row[1]
+
+      info = classset.get(author_grouping, title)
+      if nil == info
+        query_count += 1
+        info = lookup.lookup(author_grouping, title)
+        puts info.inspect()
+
+        puts 'Saving state...'
+        classset.save_state()
+        lookup.save_state()
+
+        sleep_time = 10 + rand(10)
+        puts 'Pausing for ' + sleep_time.to_s + ' seconds...'
+        sleep(sleep_time)  # Pause between lookup requests, to be polite to the server
+      end
+
+      if nil != info
+        classset.ensure_contains!(info)
+      else
+        puts 'WARNING: lookup of ' + author_grouping.to_s + ', "' + title.to_s + '" failed.'
+        File.open('failed.log', 'a:UTF-8') do |fd|
+          fd.puts(author_grouping.to_s + ',' + title.to_s)
+        end
+      end
+
+#      if query_count > 5
+#        break
+#      end
+    end
+  end
+end
+
+classset.save_state()
+lookup.save_state()
diff --git a/main.rb b/main.rb
index e1a5205..6e7a441 100644
--- a/main.rb
+++ b/main.rb
@@ -39,6 +39,7 @@ puts 'Creating output...'
navigator = Navigator.new(@store) navigator.write_atoz_pages() navigator.write_series_listing() +navigator.write_dewey() @store.disconnect() diff --git a/navigator.rb b/navigator.rb index 16da652..70c13e3 100644 --- a/navigator.rb +++ b/navigator.rb @@ -13,10 +13,11 @@ class Navigator atoz_counts[letter] = write_authors_starting_with(letter) end - content = '' + content = '

AuthorBooks
' ('A'..'Z').each do |letter| content += ' ' end + content += '
AuthorBooks
Starting with ' + letter + '' + atoz_counts[letter].to_s + '

' page = Page.new(@store) page.output_dir = 'atoz' page.special = content @@ -45,6 +46,20 @@ class Navigator return book_ids.length end + def write_dewey + book_ids = @store.query_books_by_ddc() + puts 'Non-fiction books arranged by Dewey Decimal Classification: ' + book_ids.length.to_s() + ' books.' + + page = Page.new(@store) + page.output_dir = 'ddc' + page.index_file = 'index.html' + page.title = "Non-fiction books arranged by Dewey Decimal call number" + page.up = ['../output/index.html', 'Up'] + + page.write_html(book_ids) + return book_ids.length + end + def write_series_for_age(age) series_infos = [] diff --git a/store.rb b/store.rb index 05f4413..660fc80 100644 --- a/store.rb +++ b/store.rb @@ -49,14 +49,28 @@ EOS create_books = < exc + puts 'WARNING: "' + stmt + '" failed: ' + exc.to_s + end end end @@ -176,7 +222,7 @@ EOS def load_book(id) #puts 'DEBUG: load_book(' + id + ')' - sql = "SELECT author, cover, description, path, series, title, volume FROM Books WHERE id=$1;" + sql = "SELECT author, classification, cover, description, path, series, title, volume FROM Books WHERE id=$1;" book = nil begin @@ -189,6 +235,7 @@ EOS book = Book.new(self) book.author = load_author(row['author']) + book.classification_id = row['classification'] book.cover = load_cover(row['cover']) book.description = row['description'] book.path = row['path'] @@ -207,14 +254,14 @@ EOS end def store_book(book) - sql = "INSERT INTO Books (id, author, cover, description, path, series, title, volume) VALUES ($1, $2, $3, $4, $5, $6, $7, $8);" + sql = "INSERT INTO Books (id, author, classification, cover, description, path, series, title, volume) VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9);" book_id = next_id('book_id') author_id = store_author(book.author) (efs_id, mime_type) = store_cover(book) - args = [book_id, author_id, efs_id, book.description(), book.path(), book.series_id(), book.title(), book.volume()] + args = [book_id, author_id, book.classification_id, efs_id, 
book.description(), book.path(), book.series_id(), book.title(), book.volume()] begin rs = @conn.exec_params(sql, args) @@ -229,6 +276,19 @@ EOS return book_id end + def find_classification(author_grouping, title_grouping) + #puts 'find_classification("' + author_grouping.inspect + '", "' + title_grouping.inspect + '")...' + sql = "SELECT id FROM Classifications WHERE author_grouping = $1 AND title_grouping = $2;" + @conn.exec_params(sql, [author_grouping, title_grouping]) do |rs| + if rs.ntuples > 0 + #puts ' --> ' + rs[0]['id'].inspect + return rs[0]['id'] + end + end + #puts ' --> NIL' + return nil + end + def load_cover(id) if nil == id return nil @@ -295,6 +355,28 @@ EOS return efs_id, mimetype end + def exec_id_query(sql, args) + ids = [] + @conn.exec_params(sql, args) do |rs| + rs.each do |row| + ids.push(row['id']) + end + end + return ids + end + + def exec_update(sql, args) + begin + rs = @conn.exec_params(sql, args) + rescue Exception => e + puts sql + ": " + args.inspect() + puts e.message + puts $@ + ensure + rs.clear if rs + end + end + def next_id(seq_name) id = nil @conn.exec("SELECT nextval('" + seq_name + "');") do |rs| @@ -337,22 +419,68 @@ EOS return nil end + def populate_classifications_table + puts "Populating the Classifications table..." 
+ first = true + CSV.foreach(@basepath + '/csv/class.csv') do |row| + if first + # skip the header row + first = false + else + + # First, add a row to the Classifications table + + id = next_id('classification_id') + ddc = row[0] + lcc = row[1] + author_grouping = row[2] + author_sort = row[3] + title_grouping = row[4] + title = row[5] + + sqlInsert = "INSERT INTO Classifications (id, ddc, lcc, author_grouping, author_sort, title_grouping, title) VALUES ($1, $2, $3, $4, $5, $6, $7);" + args = [id, ddc, lcc, author_grouping, author_sort, title_grouping, title] + exec_update(sqlInsert, args) + + # Second, link up with the appropriate FAST table entries + + fast = [] + input = row[6] + if input.length > 0 + fast = input.split(';') + end + + fast.each do |fast_id| + sqlInsert = "INSERT INTO FAST_Classifications (fast, classification) VALUES ($1, $2);" + args = [fast_id, id] + exec_update(sqlInsert, args) + end + end + end + end + + def populate_fast_table + puts "Populating the FAST table..." + first = true + CSV.foreach(@basepath + '/csv/fast.csv') do |row| + if first + first = false # skip the header row + else + id = row[0] + descr = row[1] + sqlInsert = "INSERT INTO FAST (id, descr) VALUES ($1, $2);" + exec_update(sqlInsert, [id, descr]) + end + end + end + def populate_series_table puts "Populating the Series table..." 
CSV.foreach(@basepath + '/csv/series.csv') do |row| id = next_id('series_id') sqlInsert = "INSERT INTO Series (id, age, genre, grouping, code, descr) VALUES ($1, $2, $3, $4, $5, $6);" args = [id] + row - begin - # DEBUG: puts 'SQL> ' + sqlInsert + ': ' + args.inspect() - rs = @conn.exec_params(sqlInsert, args) - rescue Exception => e - puts sqlInsert + ": " + args.inspect() - puts e.message - puts $@ - ensure - rs.clear if rs - end + exec_update(sqlInsert, args) end end @@ -365,13 +493,17 @@ EOS WHERE upper(a.grouping) LIKE $1 ORDER BY a.grouping, b.series, b.volume, b.title EOS - book_ids = [] - @conn.exec_params(sql, [pattern]) do |rs| - rs.each do |row| - book_ids.push(row['id']) - end - end - return book_ids + return exec_id_query(sql, [pattern]) + end + + def query_books_by_ddc + sql = +<