Improves handling of non-fiction classification data.
[quanlib.git] / classify / lookup.rb
diff --git a/classify/lookup.rb b/classify/lookup.rb
new file mode 100644 (file)
index 0000000..1fa5faa
--- /dev/null
@@ -0,0 +1,161 @@
+require 'erb'
+require 'net/http'
+require 'nokogiri'
+
+require 'bookclass'
+require 'classset'
+require 'fastset'
+
+class Lookup
+  def initialize
+    @class_set = ClassSet.new()
+    @fast_set = FastSet.new()
+  end
+  def construct_url(params)
+    first = true
+    cmd = 'http://classify.oclc.org/classify2/Classify'
+  
+    params += [ ['summary', 'false' ] ]
+  
+    params.each do |tuple|
+      name, value = tuple
+      if (first)
+        cmd += '?'
+        first = false
+      else
+        cmd += '&'
+      end
+    cmd += name + '='
+      cmd += ERB::Util.url_encode(value)
+    end
+  
+    return cmd
+  end
+  
+  def isUpper?(c)
+      return /[[:upper:]]/.match(c)
+  end
+  
+  def lookup(author_grouping, title)
+    params = [ 
+        ['author', massage_author(author_grouping)],
+        ['title', massage_title(title)]
+      ]
+  
+    cmd = construct_url(params)
+    res = submit_request(cmd)
+  
+    doc = Nokogiri::XML(res.body)
+
+    if "4" == response_code(doc)
+      # Multiple matches; pick the first one and re-query
+      owi = doc.css("works work")[0]["owi"]
+  
+      params = [
+          ['owi', owi]
+        ]
+      cmd = construct_url(params)
+      res = submit_request(cmd)
+    
+      #puts res.body
+    
+      doc = Nokogiri::XML(res.body)
+    end
+  
+    if "2" != response_code(doc)
+      puts "Lookup failed"
+      return nil
+    end
+
+    title = doc.css("classify editions edition")[0]["title"]
+  
+    info = BookClass.new(author_grouping, title)
+
+    author = doc.css("classify editions edition")[0]["author"]
+    info.author = author
+  
+    nodes = doc.css("classify recommendations ddc mostPopular")
+    if nil != nodes && nodes.length > 0
+      ddc = nodes[0]["sfa"]
+      info.ddc = ddc
+    end
+  
+    nodes = doc.css("classify recommendations lcc mostPopular")
+    if nil != nodes && nodes.length > 0
+      lcc = nodes[0]["sfa"]
+    end
+    info.lcc = lcc
+
+    headings = doc.css("classify recommendations fast headings heading")
+    headings.each do |heading|
+      #puts heading.inspect
+      id = heading['ident']
+      #puts 'ID: ' + id
+      descr = heading.content
+      #puts 'DESCR: ' + descr
+      info.add_fast(id)
+      @fast_set.add(id, descr)
+    end
+
+    info.filename = title
+
+    @class_set.ensure_contains!(info)
+
+    return info
+  end
+
+  def massage_author(input)
+      if nil == input
+        return nil
+      end
+  
+      reading_order = ""
+      input.each_char do |c|
+        if isUpper?(c) and (reading_order.length > 0)
+          reading_order += " "
+       end
+        reading_order += c
+      end
+  
+      return reading_order
+  end
+
+  def massage_title(pathname)
+    basename = File.basename(pathname, '.*')
+
+    result = ""
+    basename.each_char do |c|
+      if '_' == c
+        result += ' '
+      elsif '-' == c
+        result += "'"
+      else
+        result += c
+      end
+    end
+
+    return result
+  end
+
+  def response_code(doc)
+    return doc.css("classify response")[0]["code"]
+  end
+
+  def save_state
+    @class_set.save_state()
+    @fast_set.save_state()
+  end
+
+  def submit_request(cmd)
+    puts ('GET ' + cmd)
+  
+    url = URI.parse(cmd)
+    req = Net::HTTP::Get.new(url.to_s)
+    res = Net::HTTP.start(url.host, url.port) {|http|
+      http.request(req)
+    }
+    return res
+  end
+end
+