Minor change to OCLC lookup heuristic.
author Chris Jaekl <cejaekl@yahoo.com>
Tue, 5 Dec 2017 08:29:42 +0000 (17:29 +0900)
committer Chris Jaekl <cejaekl@yahoo.com>
Tue, 5 Dec 2017 08:29:42 +0000 (17:29 +0900)
classify/lookup.rb
classify/queryoclc.rb

index 1fa5faaa9dc6f948610960644f194af496b4fc0c..889db51f48783dce8ca4ce469bb5f0bbee68d6b3 100644 (file)
@@ -37,10 +37,10 @@ class Lookup
       return /[[:upper:]]/.match(c)
   end
   
-  def lookup(author_grouping, title)
+  def lookup(author_grouping, pathname)
     params = [ 
         ['author', massage_author(author_grouping)],
-        ['title', massage_title(title)]
+        ['title', massage_title(pathname)]
       ]
   
     cmd = construct_url(params)
@@ -64,6 +64,9 @@ class Lookup
     end
   
     if "2" != response_code(doc)
+      # Lookup failed.  Let's try shortening the title, if it's in multiple parts
+
+      #TODO TODO
       puts "Lookup failed"
       return nil
     end
@@ -124,18 +127,12 @@ class Lookup
   def massage_title(pathname)
     basename = File.basename(pathname, '.*')
 
-    result = ""
-    basename.each_char do |c|
-      if '_' == c
-        result += ' '
-      elsif '-' == c
-        result += "'"
-      else
-        result += c
-      end
-    end
+    basename.gsub!('_', ' ')
+    basename.gsub!('--', ': ')
+    basename.gsub!('-s ', "'s ")
+    basename.gsub!('s- ', "s' ")
 
-    return result
+    return basename
   end
 
   def response_code(doc)
index b3368040150e034b07a6089282021df2a2d01ed7..a4512909038372cd7a386fccae3151e8ae8c9962 100644 (file)
@@ -20,12 +20,13 @@ CSV.open('unclassified.csv', 'r:UTF-8') do |csv|
       first = false
     elsif row.length >= 2
       author_grouping = row[0]
-      title = row[1]
+      pathname = row[1]
 
-      info = classset.get(author_grouping, title)
+      info = classset.get(author_grouping, pathname)
       if nil == info
         query_count += 1
-        info = lookup.lookup(author_grouping, title)
+        info = lookup.lookup(author_grouping, pathname)
+
         puts info.inspect()
 
         puts 'Saving state...'
@@ -40,9 +41,9 @@ CSV.open('unclassified.csv', 'r:UTF-8') do |csv|
       if nil != info
         classset.ensure_contains!(info)
       else
-        puts 'WARNING:  lookup of ' + author_grouping + ', "' + title + '" failed.'
+        puts 'WARNING:  lookup of ' + author_grouping + ', "' + pathname + '" failed.'
         File.open('failed.log', 'a:UTF-8') do |fd|
-          fd.puts(author_grouping.to_s + ',' + title.to_s)
+          fd.puts(author_grouping.to_s + ',' + pathname.to_s)
         end
       end