From: Chris Jaekl Date: Tue, 5 Dec 2017 08:29:42 +0000 (+0900) Subject: Minor change to oclc lookup heuristic. X-Git-Url: https://jaekl.net/gitweb/?a=commitdiff_plain;h=952e15e8397db7ca1aad1f71e66f529b31cb75ce;p=quanlib.git Minor change to oclc lookup heuristic. --- diff --git a/classify/lookup.rb b/classify/lookup.rb index 1fa5faa..889db51 100644 --- a/classify/lookup.rb +++ b/classify/lookup.rb @@ -37,10 +37,10 @@ class Lookup return /[[:upper:]]/.match(c) end - def lookup(author_grouping, title) + def lookup(author_grouping, pathname) params = [ ['author', massage_author(author_grouping)], - ['title', massage_title(title)] + ['title', massage_title(pathname)] ] cmd = construct_url(params) @@ -64,6 +64,9 @@ class Lookup end if "2" != response_code(doc) + # Lookup failed. Let's try shortening the title, if it's in multiple parts + + #TODO TODO puts "Lookup failed" return nil end @@ -124,18 +127,12 @@ class Lookup def massage_title(pathname) basename = File.basename(pathname, '.*') - result = "" - basename.each_char do |c| - if '_' == c - result += ' ' - elsif '-' == c - result += "'" - else - result += c - end - end + basename.gsub!('_', ' ') + basename.gsub!('--', ': ') + basename.gsub!('-s ', "'s ") + basename.gsub!('s- ', "s' ") - return result + return basename end def response_code(doc) diff --git a/classify/queryoclc.rb b/classify/queryoclc.rb index b336804..a451290 100644 --- a/classify/queryoclc.rb +++ b/classify/queryoclc.rb @@ -20,12 +20,13 @@ CSV.open('unclassified.csv', 'r:UTF-8') do |csv| first = false elsif row.length >= 2 author_grouping = row[0] - title = row[1] + pathname = row[1] - info = classset.get(author_grouping, title) + info = classset.get(author_grouping, pathname) if nil == info query_count += 1 - info = lookup.lookup(author_grouping, title) + info = lookup.lookup(author_grouping, pathname) + puts info.inspect() puts 'Saving state...' @@ -40,9 +41,9 @@ CSV.open('unclassified.csv', 'r:UTF-8') do |csv| if nil != info classset.ensure_contains!(info) else - puts 'WARNING: lookup of ' + author_grouping + ', "' + title + '" failed.' + puts 'WARNING: lookup of ' + author_grouping + ', "' + pathname + '" failed.' File.open('failed.log', 'a:UTF-8') do |fd| - fd.puts(author_grouping.to_s + ',' + title.to_s) + fd.puts(author_grouping.to_s + ',' + pathname.to_s) end end