Improves handling of non-fiction classification data.
author Chris Jaekl <cejaekl@yahoo.com>
Fri, 7 Jul 2017 12:54:38 +0000 (21:54 +0900)
committer Chris Jaekl <cejaekl@yahoo.com>
Fri, 7 Jul 2017 12:54:38 +0000 (21:54 +0900)
book.rb
classify/bookclass.rb
classify/classify.rb [deleted file]
classify/classset.rb
classify/fastset.rb
classify/lookup.rb [new file with mode: 0644]
main.rb
store.rb
walkdir.rb

diff --git a/book.rb b/book.rb
index b0a1bbff66de2851b9bab47b8ad6eb0d16cc65f3..a94e33cfcffdc34ecbba6876b07e30c5c5d00bd3 100644 (file)
--- a/book.rb
+++ b/book.rb
@@ -240,6 +240,9 @@ class Book
 
   protected
   def parse_file_name!(file_name)
+    category = nil   # e.g., non-fiction, fan-fiction
+    grouping = ''
+
     parts = file_name.split('/')
     (series_code, @volume, @title) = processTitle(parts[-1])
     if parts.length > 1
@@ -249,6 +252,9 @@ class Book
       @author = Author.new(grouping, reading_order, sort_order)
       @series_id = @store.get_series(grouping, series_code)
     end
+    if parts.length > 2
+      category = parts[-3]
+    end
 
     lc_file_name = file_name.downcase
     if lc_file_name.end_with?(".epub")
@@ -258,6 +264,13 @@ class Book
     end
 
     @classification_id = @store.find_classification(@author.grouping, File.basename(file_name, '.*'))
+
+    # TODO:  Fix horrible hard-coded strings and paths
+    if ('01_nonfic' == category) && (nil == classification_id)
+      open(Store.unclassified_csv, 'a') do |fd|
+        fd.puts('"' + grouping.to_s + '","' + path + '"')
+      end 
+    end
   end
 
   protected 
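diff --git a/classify/bookclass.rb b/classify/bookclass.rb
index 20652dca9d00308f33db9129577abc2b474724ae..47cad99898b8e86ed6fa5aabe1f493ca41b3dbd0 100644 (file)
--- a/classify/bookclass.rb
+++ b/classify/bookclass.rb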
@@ -6,6 +6,7 @@ class BookClass
     @ddc = nil
     @grouping = grouping
     @fast = []
+    @filename = []
     @lcc = nil
     @title = title
   end
@@ -25,6 +26,12 @@ class BookClass
   def fast 
     @fast 
   end
+  def filename
+    @filename
+  end
+  def filename=(value)
+    @filename = value
+  end
   def grouping
     @grouping
   end
@@ -57,6 +64,9 @@ class BookClass
     if nil != @fast 
       data.push('fast=' + @fast.inspect) 
     end
+    if nil != @filename
+      data.push('filename=' + @filename.to_s + '"')
+    end
     if nil != @lcc 
       data.push('lcc="' + @lcc + '"') 
     end
diff --git a/classify/classify.rb b/classify/classify.rb
deleted file mode 100644 (file)
index 9cc28ae..0000000
--- a/classify/classify.rb
+++ /dev/null
@@ -1,142 +0,0 @@
-require 'erb'
-require 'net/http'
-require 'nokogiri'
-
-require 'bookclass'
-require 'classset'
-require 'fastset'
-
-class Lookup
-  def initialize
-    @class_set = ClassSet.new()
-    @fast_set = FastSet.new()
-  end
-  def construct_url(params)
-    first = true
-    cmd = 'http://classify.oclc.org/classify2/Classify'
-  
-    params += [ ['summary', 'false' ] ]
-  
-    params.each do |tuple|
-      name, value = tuple
-      if (first)
-        cmd += '?'
-        first = false
-      else
-        cmd += '&'
-      end
-    cmd += name + '='
-      cmd += ERB::Util.url_encode(value)
-    end
-  
-    return cmd
-  end
-  
-  def isUpper?(c)
-      return /[[:upper:]]/.match(c)
-  end
-  
-  def lookup(author_grouping, title)
-    params = [ 
-        ['author', massage_author(author_grouping)],
-        ['title', title]
-      ]
-  
-    cmd = construct_url(params)
-    res = submit_request(cmd)
-  
-    doc = Nokogiri::XML(res.body)
-
-    if "4" == response_code(doc)
-      # Multiple matches; pick the first one and re-query
-      owi = doc.css("works work")[0]["owi"]
-  
-      params = [
-          ['owi', owi]
-        ]
-      cmd = construct_url(params)
-      res = submit_request(cmd)
-    
-      #puts res.body
-    
-      doc = Nokogiri::XML(res.body)
-    end
-  
-    if "2" != response_code(doc)
-      puts "Lookup failed"
-      return nil
-    end
-
-    title = doc.css("classify editions edition")[0]["title"]
-  
-    info = BookClass.new(author_grouping, title)
-
-    author = doc.css("classify editions edition")[0]["author"]
-    info.author = author
-  
-    nodes = doc.css("classify recommendations ddc mostPopular")
-    if nil != nodes && nodes.length > 0
-      ddc = nodes[0]["sfa"]
-      info.ddc = ddc
-    end
-  
-    nodes = doc.css("classify recommendations lcc mostPopular")
-    if nil != nodes && nodes.length > 0
-      lcc = nodes[0]["sfa"]
-    end
-    info.lcc = lcc
-
-    headings = doc.css("classify recommendations fast headings heading")
-    headings.each do |heading|
-      #puts heading.inspect
-      id = heading['ident']
-      #puts 'ID: ' + id
-      descr = heading.content
-      #puts 'DESCR: ' + descr
-      info.add_fast(id)
-      @fast_set.add(id, descr)
-    end
-
-    @class_set.ensure_contains!(info)
-
-    return info
-  end
-
-  def massage_author(input)
-      if nil == input
-        return nil
-      end
-  
-      reading_order = ""
-      input.each_char do |c|
-        if isUpper?(c) and (reading_order.length > 0)
-          reading_order += " "
-       end
-        reading_order += c
-      end
-  
-      return reading_order
-  end
-
-  def response_code(doc)
-    return doc.css("classify response")[0]["code"]
-  end
-
-  def save_state
-    @class_set.save_state()
-    @fast_set.save_state()
-  end
-
-  def submit_request(cmd)
-    puts ('GET ' + cmd)
-  
-    url = URI.parse(cmd)
-    req = Net::HTTP::Get.new(url.to_s)
-    res = Net::HTTP.start(url.host, url.port) {|http|
-      http.request(req)
-    }
-    return res
-  end
-end
-
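diff --git a/classify/classset.rb b/classify/classset.rb
index 710db7a3c89633c1cc7afbd28839de91b3ecdd59..4613fc746ee8ec5fe5c9d7052e806592ca916a7e 100644 (file)
--- a/classify/classset.rb
+++ b/classify/classset.rb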
@@ -67,16 +67,18 @@ class ClassSet
           lcc = row[1]
           grouping = row[2]
           author = row[3]
-          title = row[4]
+          filename = row[4]
+          title = row[5]
           fast = []
-          if nil != row[5]
-            fast = row[5].split(';')
+          if nil != row[6]
+            fast = row[6].split(';')
           end
   
           bookclass = BookClass.new(grouping, title)
           bookclass.ddc = ddc
           bookclass.lcc = lcc
           bookclass.author = author
+          bookclass.filename = filename
   
           fast.each do |id|
             bookclass.add_fast(id)
@@ -84,6 +86,8 @@ class ClassSet
   
           key = construct_key(grouping, title)
           @entries[key] = bookclass
+
+          #puts 'LOADED[' + key.inspect + ']: ' + bookclass.inspect
         end
       end
     end
@@ -91,24 +95,22 @@ class ClassSet
 
   def save(file_name)
     CSV.open(file_name, 'w:UTF-8') do |csv|
-      csv << ['Dewey', 'LCC', 'Grouping', 'Author', 'Title', 'FAST']
+      csv << ['Dewey', 'LCC', 'Grouping', 'Author', 'Filename', 'Title', 'FAST']
 
       @entries.keys.sort.each do |key|
         info = @entries[key]
 
+        #puts 'SAVING[' + key.inspect + ']: ' + info.inspect
+
         ddc = info.ddc
         lcc = info.lcc
         grouping = info.grouping
         author = info.author
+        filename = info.filename
         title = info.title
-        fast_list = info.fast
-        fast_ids = []
-        fast_list.each do |tuple|
-          fast_ids.push(tuple[0])
-        end
-        fast = fast_ids.join(';')
+        fast = info.fast.join(';')
         
-        csv << [ ddc, lcc, grouping, author, title, fast ]
+        csv << [ ddc, lcc, grouping, author, filename, title, fast ]
       end
     end
   end
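diff --git a/classify/fastset.rb b/classify/fastset.rb
index fa3883ef65d5aa763d9a17dae7385815fda2e904..0d97aeb1ec8c32087b13e46193b997ede4a55d4b 100644 (file)
--- a/classify/fastset.rb
+++ b/classify/fastset.rb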
@@ -34,24 +34,26 @@ class FastSet
       return 
     end
 
-    File.open(file_name, 'r:UTF-8').each_line do |line|
-      cols = line.split(/,/)
-      if first
-        first = false
-      elsif cols.length > 1
-        id = cols[0]
-        descr = cols[1]
-        @entries[id] = descr
+    first = true
+    CSV.open(file_name, 'r:UTF-8') do |csv|
+      csv.to_a.each do |row|
+        if first
+          first = false
+        elsif row.length >= 2
+          id = row[0]
+          descr = row[1]
+          @entries[id] = descr
+        end
       end
     end
   end
 
   def save(file_name)
-    File.open(file_name, 'w:UTF-8') do |fd|
-      fd.puts('Code,Description')
+    CSV.open(file_name, 'w:UTF-8') do |csv|
+      csv << [ 'Code', 'Description' ]
       
       @entries.keys.sort.each do |key|
-        fd.puts(key.to_s + ',' + @entries[key].to_s)
+        csv << [ key.to_s, @entries[key].to_s ]
       end
     end
   end
diff --git a/classify/lookup.rb b/classify/lookup.rb
new file mode 100644 (file)
index 0000000..1fa5faa
--- /dev/null
+++ b/classify/lookup.rb
@@ -0,0 +1,161 @@
+require 'erb'
+require 'net/http'
+require 'nokogiri'
+
+require 'bookclass'
+require 'classset'
+require 'fastset'
+
+class Lookup
+  def initialize
+    @class_set = ClassSet.new()
+    @fast_set = FastSet.new()
+  end
+  def construct_url(params)
+    first = true
+    cmd = 'http://classify.oclc.org/classify2/Classify'
+  
+    params += [ ['summary', 'false' ] ]
+  
+    params.each do |tuple|
+      name, value = tuple
+      if (first)
+        cmd += '?'
+        first = false
+      else
+        cmd += '&'
+      end
+    cmd += name + '='
+      cmd += ERB::Util.url_encode(value)
+    end
+  
+    return cmd
+  end
+  
+  def isUpper?(c)
+      return /[[:upper:]]/.match(c)
+  end
+  
+  def lookup(author_grouping, title)
+    params = [ 
+        ['author', massage_author(author_grouping)],
+        ['title', massage_title(title)]
+      ]
+  
+    cmd = construct_url(params)
+    res = submit_request(cmd)
+  
+    doc = Nokogiri::XML(res.body)
+
+    if "4" == response_code(doc)
+      # Multiple matches; pick the first one and re-query
+      owi = doc.css("works work")[0]["owi"]
+  
+      params = [
+          ['owi', owi]
+        ]
+      cmd = construct_url(params)
+      res = submit_request(cmd)
+    
+      #puts res.body
+    
+      doc = Nokogiri::XML(res.body)
+    end
+  
+    if "2" != response_code(doc)
+      puts "Lookup failed"
+      return nil
+    end
+
+    title = doc.css("classify editions edition")[0]["title"]
+  
+    info = BookClass.new(author_grouping, title)
+
+    author = doc.css("classify editions edition")[0]["author"]
+    info.author = author
+  
+    nodes = doc.css("classify recommendations ddc mostPopular")
+    if nil != nodes && nodes.length > 0
+      ddc = nodes[0]["sfa"]
+      info.ddc = ddc
+    end
+  
+    nodes = doc.css("classify recommendations lcc mostPopular")
+    if nil != nodes && nodes.length > 0
+      lcc = nodes[0]["sfa"]
+    end
+    info.lcc = lcc
+
+    headings = doc.css("classify recommendations fast headings heading")
+    headings.each do |heading|
+      #puts heading.inspect
+      id = heading['ident']
+      #puts 'ID: ' + id
+      descr = heading.content
+      #puts 'DESCR: ' + descr
+      info.add_fast(id)
+      @fast_set.add(id, descr)
+    end
+
+    info.filename = title
+
+    @class_set.ensure_contains!(info)
+
+    return info
+  end
+
+  def massage_author(input)
+      if nil == input
+        return nil
+      end
+  
+      reading_order = ""
+      input.each_char do |c|
+        if isUpper?(c) and (reading_order.length > 0)
+          reading_order += " "
+       end
+        reading_order += c
+      end
+  
+      return reading_order
+  end
+
+  def massage_title(pathname)
+    basename = File.basename(pathname, '.*')
+
+    result = ""
+    basename.each_char do |c|
+      if '_' == c
+        result += ' '
+      elsif '-' == c
+        result += "'"
+      else
+        result += c
+      end
+    end
+
+    return result
+  end
+
+  def response_code(doc)
+    return doc.css("classify response")[0]["code"]
+  end
+
+  def save_state
+    @class_set.save_state()
+    @fast_set.save_state()
+  end
+
+  def submit_request(cmd)
+    puts ('GET ' + cmd)
+  
+    url = URI.parse(cmd)
+    req = Net::HTTP::Get.new(url.to_s)
+    res = Net::HTTP.start(url.host, url.port) {|http|
+      http.request(req)
+    }
+    return res
+  end
+end
+
diff --git a/main.rb b/main.rb
index 6e7a44184ff4b94357e960ef6e5f4c3d6b07a4d1..4ea9c7057de2e456b65eecb2fcad0515b3fbc283 100644 (file)
--- a/main.rb
+++ b/main.rb
@@ -12,6 +12,9 @@ def handleArg(arg)
   if "--purge" == arg
     puts 'Purging database...'
     @store.dropSchema()
+    if File.exists?(Store.unclassified_csv)
+      File.delete(Store.unclassified_csv)
+    end
   elsif arg.start_with?("--")
     abort('ERROR:  Unrecognized option "' + arg + '".')
   end
diff --git a/store.rb b/store.rb
index b1e3d7f7bb1e06b2540689d02a8742c957efc333..4895a5bb7483bbcf474c62daca224eee2227e85c 100644 (file)
--- a/store.rb
+++ b/store.rb
@@ -6,8 +6,14 @@ require 'pg'
 require 'series'
 
 class Store
+  @@BASEPATH = '/arc/quanlib'  # TODO: FIXME: configure this in a sane way
+  @@UNCLASSIFIED_CSV = @@BASEPATH + '/unclassified.csv'
+
+  def self.unclassified_csv
+    @@UNCLASSIFIED_CSV
+  end
+
   def initialize
-    @basepath = '/arc/quanlib' # TODO: FIXME: configure this in a sane way
     @conn = nil
 
     #@dburl = 'dbi:Pg:quanlib:localhost'
@@ -328,7 +334,7 @@ EOS
 
     (efspath, efsname) = construct_efs_path(id)
 
-    fullpath = @basepath + '/efs/' + efspath + '/' + efsname
+    fullpath = @@BASEPATH + '/efs/' + efspath + '/' + efsname
 
     return Cover.new(nil, fullpath, mime_type)
 
@@ -357,7 +363,7 @@ EOS
 
     (efspath, efsname) = construct_efs_path(efs_id)
 
-    efspath = @basepath + '/efs/' + efspath
+    efspath = @@BASEPATH + '/efs/' + efspath
 
     FileUtils.mkdir_p(efspath)
 
@@ -444,7 +450,7 @@ EOS
   def populate_classifications_table
     puts "Populating the Classifications table..."
     first = true
-    CSV.foreach(@basepath + '/csv/class.csv') do |row|
+    CSV.foreach(@@BASEPATH + '/csv/class.csv') do |row|
       if first
         # skip the header row
         first = false
@@ -484,7 +490,7 @@ EOS
   def populate_fast_table
     puts "Populating the FAST table..."
     first = true
-    CSV.foreach(@basepath + '/csv/fast.csv') do |row|
+    CSV.foreach(@@BASEPATH + '/csv/fast.csv') do |row|
       if first
         first = false  # skip the header row
       else
@@ -498,7 +504,7 @@ EOS
 
   def populate_series_table
     puts "Populating the Series table..."
-    CSV.foreach(@basepath + '/csv/series.csv') do |row|
+    CSV.foreach(@@BASEPATH + '/csv/series.csv') do |row|
       id = next_id('series_id')
       sqlInsert = "INSERT INTO Series (id, age, genre, grouping, code, descr) VALUES ($1, $2, $3, $4, $5, $6);"
       args = [id] + row
diff --git a/walkdir.rb b/walkdir.rb
index fb23fcf56d44d1fd8bbbad76153aabd2debb6898..800b7fd40eb0db41318c85697e8418bb251ff206 100644 (file)
--- a/walkdir.rb
+++ b/walkdir.rb
@@ -4,7 +4,7 @@
 #   .../AuthorName/Title_of_the_Awesome_Book.ext
 #
 # Author is given as FirstLast.  For example, 
-# Robert Anson Heinlein is RoberHeinlein, and 
+# Robert Anson Heinlein is RobertHeinlein, and 
 # JKRowling is JoanneRowling.
 #
 # Book titles have spaces replaced with underscores,