From ad4b08a7ee90cea8a88a6d79e27ece2cb164cbf4 Mon Sep 17 00:00:00 2001 From: Chris Jaekl Date: Sat, 22 Jun 2024 11:37:41 -0400 Subject: [PATCH] Remove OCLC classification client code OCLC discontinued their open-access classify service in January 2024. Thus, there's no point in maintaining this code anymore. --- classify/bookclass.rb | 80 --------------------- classify/classset.rb | 122 -------------------------------- classify/fastset.rb | 65 ----------------- classify/lookup.rb | 158 ------------------------------------------ classify/queryoclc.rb | 58 ---------------- 5 files changed, 483 deletions(-) delete mode 100644 classify/bookclass.rb delete mode 100644 classify/classset.rb delete mode 100644 classify/fastset.rb delete mode 100644 classify/lookup.rb delete mode 100644 classify/queryoclc.rb diff --git a/classify/bookclass.rb b/classify/bookclass.rb deleted file mode 100644 index 47cad99..0000000 --- a/classify/bookclass.rb +++ /dev/null @@ -1,80 +0,0 @@ -# Classification information for a single book - -class BookClass - def initialize(grouping, title) - @author = nil - @ddc = nil - @grouping = grouping - @fast = [] - @filename = [] - @lcc = nil - @title = title - end - - def author - @author - end - def author=(value) - @author = value - end - def ddc - @ddc - end - def ddc=(value) - @ddc = value - end - def fast - @fast - end - def filename - @filename - end - def filename=(value) - @filename = value - end - def grouping - @grouping - end - def lcc - @lcc - end - def lcc=(value) - @lcc = value - end - def title - @title - end - - def add_fast(id) - @fast.push(id) - end - - def inspect - data = [] - - if nil != @author_name - data.push('author_name="' + @author_name + '"') - end - if nil != @ddc - data.push('ddc="' + @ddc + '"') - end - if nil != @grouping - data.push('grouping="' + @grouping + '"') - end - if nil != @fast - data.push('fast=' + @fast.inspect) - end - if nil != @filename - data.push('filename=' + @filename.to_s + '"') - end - if nil != @lcc - data.push('lcc="' + @lcc + '"') - end - if nil != @title - data.push('title="' + @title + '"') - end - - return '(BookClass:' + data.join(',') + ')' - end -end - diff --git a/classify/classset.rb b/classify/classset.rb deleted file mode 100644 index 4613fc7..0000000 --- a/classify/classset.rb +++ /dev/null @@ -1,122 +0,0 @@ -require 'csv' - -require 'bookclass' - -class ClassSet - @@class_csv_file = 'class.csv' - - def initialize - @entries = {} - load!(@@class_csv_file) - end - - def add!(info) - key = construct_key(info.grouping, info.title) - @entries[key] = info - end - - def construct_key(author_grouping, title) - author_grouping.to_s + '|' + title.to_s - end - - def get(author_grouping, title) - key = construct_key(author_grouping, title) - if @entries.has_key?(key) - return @entries[key] - else - return nil - end - end - - def has_key?(author_grouping, title) - @entries.has_key?(construct_key(author_grouping, title)) - end - - def ensure_contains!(info) - if ! has_key?(info.grouping, info.title) - add!(info) - end - end - - def inspect - data = [] - - if nil != @entries - data.push('entries=' + @entries.inspect + '') - end - - return '(ClassSet:' + data.join(',') + ')' - end - - def load!(file_name) - first = true - @entries = {} - - if ! File.exist?(file_name) - puts 'WARNING: file "' + file_name + '" not found.' - return - end - - File.open(file_name, 'r:UTF-8') do |fd| - csv = CSV.new(fd) - csv.to_a.each do |row| - if first - first = false - elsif row.length >= 6 - ddc = row[0] - lcc = row[1] - grouping = row[2] - author = row[3] - filename = row[4] - title = row[5] - fast = [] - if nil != row[6] - fast = row[6].split(';') - end - - bookclass = BookClass.new(grouping, title) - bookclass.ddc = ddc - bookclass.lcc = lcc - bookclass.author = author - bookclass.filename = filename - - fast.each do |id| - bookclass.add_fast(id) - end - - key = construct_key(grouping, title) - @entries[key] = bookclass - - #puts 'LOADED[' + key.inspect + ']: ' + bookclass.inspect - end - end - end - end - - def save(file_name) - CSV.open(file_name, 'w:UTF-8') do |csv| - csv << ['Dewey', 'LCC', 'Grouping', 'Author', 'Filename', 'Title', 'FAST'] - - @entries.keys.sort.each do |key| - info = @entries[key] - - #puts 'SAVING[' + key.inspect + ']: ' + info.inspect - - ddc = info.ddc - lcc = info.lcc - grouping = info.grouping - author = info.author - filename = info.filename - title = info.title - fast = info.fast.join(';') - - csv << [ ddc, lcc, grouping, author, filename, title, fast ] - end - end - end - - def save_state - save(@@class_csv_file) - end -end - diff --git a/classify/fastset.rb b/classify/fastset.rb deleted file mode 100644 index 0d97aeb..0000000 --- a/classify/fastset.rb +++ /dev/null @@ -1,65 +0,0 @@ - -class FastSet - @@fast_csv_file = 'fast.csv' - - def initialize - @entries = {} - load!(@@fast_csv_file) - end - - def add(id, descr) - @entries[id] = descr - end - - def has_key?(value) - @entries.has_key?(value) - end - - def inspect - data = [] - - if nil != @entries - data.push('entries=' + @entries.inspect + '') - end - - return '(FastSet:' + data.join(',') + ')' - end - - def load!(file_name) - first = true - @entries = {} - - if ! File.exist?(file_name) - puts 'WARNING: file "' + file_name + '" not found.' - return - end - - first = true - CSV.open(file_name, 'r:UTF-8') do |csv| - csv.to_a.each do |row| - if first - first = false - elsif row.length >= 2 - id = row[0] - descr = row[1] - @entries[id] = descr - end - end - end - end - - def save(file_name) - CSV.open(file_name, 'w:UTF-8') do |csv| - csv << [ 'Code', 'Description' ] - - @entries.keys.sort.each do |key| - csv << [ key.to_s, @entries[key].to_s ] - end - end - end - - def save_state - save(@@fast_csv_file) - end -end - diff --git a/classify/lookup.rb b/classify/lookup.rb deleted file mode 100644 index 889db51..0000000 --- a/classify/lookup.rb +++ /dev/null @@ -1,158 +0,0 @@ -require 'erb' -require 'net/http' -require 'nokogiri' - -require 'bookclass' -require 'classset' -require 'fastset' - -class Lookup - def initialize - @class_set = ClassSet.new() - @fast_set = FastSet.new() - end - - def construct_url(params) - first = true - cmd = 'http://classify.oclc.org/classify2/Classify' - - params += [ ['summary', 'false' ] ] - - params.each do |tuple| - name, value = tuple - if (first) - cmd += '?' - first = false - else - cmd += '&' - end - cmd += name + '=' - cmd += ERB::Util.url_encode(value) - end - - return cmd - end - - def isUpper?(c) - return /[[:upper:]]/.match(c) - end - - def lookup(author_grouping, pathname) - params = [ - ['author', massage_author(author_grouping)], - ['title', massage_title(pathname)] - ] - - cmd = construct_url(params) - res = submit_request(cmd) - - doc = Nokogiri::XML(res.body) - - if "4" == response_code(doc) - # Multiple matches; pick the first one and re-query - owi = doc.css("works work")[0]["owi"] - - params = [ - ['owi', owi] - ] - cmd = construct_url(params) - res = submit_request(cmd) - - #puts res.body - - doc = Nokogiri::XML(res.body) - end - - if "2" != response_code(doc) - # Lookup failed. Let's try shortening the title, if it's in multiple parts - - #TODO TODO - puts "Lookup failed" - return nil - end - - title = doc.css("classify editions edition")[0]["title"] - - info = BookClass.new(author_grouping, title) - - author = doc.css("classify editions edition")[0]["author"] - info.author = author - - nodes = doc.css("classify recommendations ddc mostPopular") - if nil != nodes && nodes.length > 0 - ddc = nodes[0]["sfa"] - info.ddc = ddc - end - - nodes = doc.css("classify recommendations lcc mostPopular") - if nil != nodes && nodes.length > 0 - lcc = nodes[0]["sfa"] - end - info.lcc = lcc - - headings = doc.css("classify recommendations fast headings heading") - headings.each do |heading| - #puts heading.inspect - id = heading['ident'] - #puts 'ID: ' + id - descr = heading.content - #puts 'DESCR: ' + descr - info.add_fast(id) - @fast_set.add(id, descr) - end - - info.filename = title - - @class_set.ensure_contains!(info) - - return info - end - - def massage_author(input) - if nil == input - return nil - end - - reading_order = "" - input.each_char do |c| - if isUpper?(c) and (reading_order.length > 0) - reading_order += " " - end - reading_order += c - end - - return reading_order - end - - def massage_title(pathname) - basename = File.basename(pathname, '.*') - - basename.gsub!('_', ' ') - basename.gsub!('--', ': ') - basename.gsub!('-s ', "'s ") - basename.gsub!('s- ', "s' ") - - return basename - end - - def response_code(doc) - return doc.css("classify response")[0]["code"] - end - - def save_state - @class_set.save_state() - @fast_set.save_state() - end - - def submit_request(cmd) - puts ('GET ' + cmd) - - url = URI.parse(cmd) - req = Net::HTTP::Get.new(url.to_s) - res = Net::HTTP.start(url.host, url.port) {|http| - http.request(req) - } - return res - end -end - diff --git a/classify/queryoclc.rb b/classify/queryoclc.rb deleted file mode 100644 index a451290..0000000 --- a/classify/queryoclc.rb +++ /dev/null @@ -1,58 +0,0 @@ -require 'classset' -require 'lookup' - -#if ARGV.length != 2 -# puts 'Usage: ruby classify.rb author title' -# exit 1 -#end -# -#author_grouping = ARGV[0] -#title = ARGV[1] - -classset = ClassSet.new() -lookup = Lookup.new() - -CSV.open('unclassified.csv', 'r:UTF-8') do |csv| - query_count = 0 - first = true - csv.to_a.each do |row| - if first - first = false - elsif row.length >= 2 - author_grouping = row[0] - pathname = row[1] - - info = classset.get(author_grouping, pathname) - if nil == info - query_count += 1 - info = lookup.lookup(author_grouping, pathname) - - puts info.inspect() - - puts 'Saving state...' - classset.save_state() - lookup.save_state() - - sleep_time = 10 + rand(10) - puts 'Pausing for ' + sleep_time.to_s + ' seconds...' - sleep(sleep_time) # Pause between lookup requests, to be polite to the server - end - - if nil != info - classset.ensure_contains!(info) - else - puts 'WARNING: lookup of ' + author_grouping + ', "' + pathname + '" failed.' - File.open('failed.log', 'a:UTF-8') do |fd| - fd.puts(author_grouping.to_s + ',' + pathname.to_s) - end - end - -# if query_count > 5 -# break -# end - end - end -end - -classset.save_state() -lookup.save_state() -- 2.39.2