From 0aeb88ddc91aa8f9fd8d93a8004d5df5094a4613 Mon Sep 17 00:00:00 2001
From: Chris Jaekl <cejaekl@yahoo.com>
Date: Fri, 7 Jul 2017 21:54:38 +0900
Subject: [PATCH] Improves handling of non-fiction classification data.

---
 book.rb                             | 13 +++++++++++++
 classify/bookclass.rb               | 10 ++++++++++
 classify/classset.rb                | 24 +++++++++++++-----------
 classify/fastset.rb                 | 24 +++++++++++++-----------
 classify/{classify.rb => lookup.rb} | 21 ++++++++++++++++++++-
 main.rb                             |  3 +++
 store.rb                            | 18 ++++++++++++------
 walkdir.rb                          |  2 +-
 8 files changed, 85 insertions(+), 30 deletions(-)
 rename classify/{classify.rb => lookup.rb} (89%)

diff --git a/book.rb b/book.rb
index b0a1bbf..a94e33c 100644
--- a/book.rb
+++ b/book.rb
@@ -240,6 +240,9 @@ class Book
 
   protected
   def parse_file_name!(file_name)
+    category = nil   # e.g., non-fiction, fan-fiction
+    grouping = ''
+
     parts = file_name.split('/')
     (series_code, @volume, @title) = processTitle(parts[-1])
     if parts.length > 1
@@ -249,6 +252,9 @@ class Book
       @author = Author.new(grouping, reading_order, sort_order)
       @series_id = @store.get_series(grouping, series_code)
     end
+    if parts.length > 2
+      category = parts[-3]
+    end
 
     lc_file_name = file_name.downcase
     if lc_file_name.end_with?(".epub")
@@ -258,6 +264,13 @@ class Book
     end
 
     @classification_id = @store.find_classification(@author.grouping, File.basename(file_name, '.*'))
+
+    # TODO:  Fix horrible hard-coded strings and paths
+    if ('01_nonfic' == category) && @classification_id.nil?
+      File.open(Store.unclassified_csv, 'a') do |fd|
+        fd.puts('"' + grouping.to_s + '","' + path + '"')
+      end
+    end
   end
 
   protected 
diff --git a/classify/bookclass.rb b/classify/bookclass.rb
index 20652dc..47cad99 100644
--- a/classify/bookclass.rb
+++ b/classify/bookclass.rb
@@ -6,6 +6,7 @@ class BookClass
     @ddc = nil
     @grouping = grouping
     @fast = []
+    @filename = nil   # a single name once assigned, unlike the @fast list
     @lcc = nil
     @title = title
   end
@@ -25,6 +26,12 @@ class BookClass
   def fast 
     @fast 
   end
+  # Reader for the file name recorded for this entry; ClassSet fills it
+  # from the 'Filename' column when it loads the classification CSV.
+  attr_reader :filename
+
+  # Writer for the same attribute; stores the given value unchanged.
+  attr_writer :filename
   def grouping
     @grouping
   end
@@ -57,6 +64,9 @@ class BookClass
     if nil != @fast 
       data.push('fast=' + @fast.inspect) 
     end
+    if nil != @filename
+      data.push('filename="' + @filename.to_s + '"')
+    end
     if nil != @lcc 
       data.push('lcc="' + @lcc + '"') 
     end
diff --git a/classify/classset.rb b/classify/classset.rb
index 710db7a..4613fc7 100644
--- a/classify/classset.rb
+++ b/classify/classset.rb
@@ -67,16 +67,18 @@ class ClassSet
           lcc = row[1]
           grouping = row[2]
           author = row[3]
-          title = row[4]
+          filename = row[4]
+          title = row[5]
           fast = []
-          if nil != row[5]
-            fast = row[5].split(';')
+          if nil != row[6]
+            fast = row[6].split(';')
           end
   
           bookclass = BookClass.new(grouping, title)
           bookclass.ddc = ddc
           bookclass.lcc = lcc
           bookclass.author = author
+          bookclass.filename = filename
   
           fast.each do |id|
             bookclass.add_fast(id)
@@ -84,6 +86,8 @@ class ClassSet
   
           key = construct_key(grouping, title)
           @entries[key] = bookclass
+
+          #puts 'LOADED[' + key.inspect + ']: ' + bookclass.inspect
         end
       end
     end
@@ -91,24 +95,22 @@ class ClassSet
 
   def save(file_name)
     CSV.open(file_name, 'w:UTF-8') do |csv|
-      csv << ['Dewey', 'LCC', 'Grouping', 'Author', 'Title', 'FAST']
+      csv << ['Dewey', 'LCC', 'Grouping', 'Author', 'Filename', 'Title', 'FAST']
 
       @entries.keys.sort.each do |key|
         info = @entries[key]
 
+        #puts 'SAVING[' + key.inspect + ']: ' + info.inspect
+
         ddc = info.ddc
         lcc = info.lcc
         grouping = info.grouping
         author = info.author
+        filename = info.filename
         title = info.title
-        fast_list = info.fast
-        fast_ids = []
-        fast_list.each do |tuple|
-          fast_ids.push(tuple[0])
-        end
-        fast = fast_ids.join(';')
+        fast = info.fast.join(';')
         
-        csv << [ ddc, lcc, grouping, author, title, fast ]
+        csv << [ ddc, lcc, grouping, author, filename, title, fast ]
       end
     end
   end
diff --git a/classify/fastset.rb b/classify/fastset.rb
index fa3883e..0d97aeb 100644
--- a/classify/fastset.rb
+++ b/classify/fastset.rb
@@ -34,24 +34,26 @@ class FastSet
       return 
     end
 
-    File.open(file_name, 'r:UTF-8').each_line do |line|
-      cols = line.split(/,/)
-      if first
-        first = false
-      elsif cols.length > 1
-        id = cols[0]
-        descr = cols[1]
-        @entries[id] = descr
+    first = true
+    CSV.open(file_name, 'r:UTF-8') do |csv|
+      csv.to_a.each do |row|
+        if first
+          first = false
+        elsif row.length >= 2
+          id = row[0]
+          descr = row[1]
+          @entries[id] = descr
+        end
       end
     end
   end
 
   def save(file_name)
-    File.open(file_name, 'w:UTF-8') do |fd|
-      fd.puts('Code,Description')
+    CSV.open(file_name, 'w:UTF-8') do |csv|
+      csv << [ 'Code', 'Description' ]
       
       @entries.keys.sort.each do |key|
-        fd.puts(key.to_s + ',' + @entries[key].to_s)
+        csv << [ key.to_s, @entries[key].to_s ]
       end
     end
   end
diff --git a/classify/classify.rb b/classify/lookup.rb
similarity index 89%
rename from classify/classify.rb
rename to classify/lookup.rb
index 9cc28ae..1fa5faa 100644
--- a/classify/classify.rb
+++ b/classify/lookup.rb
@@ -40,7 +40,7 @@ class Lookup
   def lookup(author_grouping, title)
     params = [ 
         ['author', massage_author(author_grouping)],
-        ['title', title]
+        ['title', massage_title(title)]
       ]
   
     cmd = construct_url(params)
@@ -98,6 +98,8 @@ class Lookup
       @fast_set.add(id, descr)
     end
 
+    info.filename = title
+
     @class_set.ensure_contains!(info)
 
     return info
@@ -119,6 +121,23 @@ class Lookup
       return reading_order
   end
 
+  # Converts a book's path name into a title for the lookup query.
+  #
+  # The directory part and the extension are stripped from +pathname+;
+  # then every '_' becomes a space and every '-' becomes an apostrophe.
+  # All other characters are kept as they are.
+  #
+  # pathname:: path, or bare file name, of the book
+  #
+  # Returns the converted title as a new String.
+  def massage_title(pathname)
+    replacements = { '_' => ' ', '-' => "'" }
+
+    stem = File.basename(pathname, '.*')
+    converted = stem.each_char.map { |c| replacements.fetch(c, c) }
+    return converted.join
+  end
+
   def response_code(doc)
     return doc.css("classify response")[0]["code"]
   end
diff --git a/main.rb b/main.rb
index 6e7a441..4ea9c70 100644
--- a/main.rb
+++ b/main.rb
@@ -12,6 +12,9 @@ def handleArg(arg)
   if "--purge" == arg
     puts 'Purging database...'
     @store.dropSchema()
+    if File.exist?(Store.unclassified_csv)
+      File.delete(Store.unclassified_csv)
+    end
   elsif arg.start_with?("--")
     abort('ERROR:  Unrecognized option "' + arg + '".')
   end
diff --git a/store.rb b/store.rb
index b1e3d7f..4895a5b 100644
--- a/store.rb
+++ b/store.rb
@@ -6,8 +6,14 @@ require 'pg'
 require 'series'
 
 class Store
+  @@BASEPATH = '/arc/quanlib'	# TODO: FIXME: configure this in a sane way
+  @@UNCLASSIFIED_CSV = @@BASEPATH + '/unclassified.csv'
+
+  def self.unclassified_csv
+    @@UNCLASSIFIED_CSV
+  end
+
   def initialize
-    @basepath = '/arc/quanlib'	# TODO: FIXME: configure this in a sane way
     @conn = nil
 
     #@dburl = 'dbi:Pg:quanlib:localhost'
@@ -328,7 +334,7 @@ EOS
 
     (efspath, efsname) = construct_efs_path(id)
 
-    fullpath = @basepath + '/efs/' + efspath + '/' + efsname
+    fullpath = @@BASEPATH + '/efs/' + efspath + '/' + efsname
 
     return Cover.new(nil, fullpath, mime_type)
 
@@ -357,7 +363,7 @@ EOS
 
     (efspath, efsname) = construct_efs_path(efs_id)
 
-    efspath = @basepath + '/efs/' + efspath
+    efspath = @@BASEPATH + '/efs/' + efspath
 
     FileUtils.mkdir_p(efspath)
 
@@ -444,7 +450,7 @@ EOS
   def populate_classifications_table
     puts "Populating the Classifications table..."
     first = true
-    CSV.foreach(@basepath + '/csv/class.csv') do |row|
+    CSV.foreach(@@BASEPATH + '/csv/class.csv') do |row|
       if first
         # skip the header row
         first = false
@@ -484,7 +490,7 @@ EOS
   def populate_fast_table
     puts "Populating the FAST table..."
     first = true
-    CSV.foreach(@basepath + '/csv/fast.csv') do |row|
+    CSV.foreach(@@BASEPATH + '/csv/fast.csv') do |row|
       if first
         first = false	# skip the header row
       else
@@ -498,7 +504,7 @@ EOS
 
   def populate_series_table
     puts "Populating the Series table..."
-    CSV.foreach(@basepath + '/csv/series.csv') do |row|
+    CSV.foreach(@@BASEPATH + '/csv/series.csv') do |row|
       id = next_id('series_id')
       sqlInsert = "INSERT INTO Series (id, age, genre, grouping, code, descr) VALUES ($1, $2, $3, $4, $5, $6);"
       args = [id] + row
diff --git a/walkdir.rb b/walkdir.rb
index fb23fcf..800b7fd 100644
--- a/walkdir.rb
+++ b/walkdir.rb
@@ -4,7 +4,7 @@
 #   .../AuthorName/Title_of_the_Awesome_Book.ext
 #
 # Author is given as FirstLast.  For example, 
-# Robert Anson Heinlein is RoberHeinlein, and 
+# Robert Anson Heinlein is RobertHeinlein, and 
 # JKRowling is JoanneRowling.
 #
 # Book titles have spaces replaced with underscores,
-- 
2.30.2