From 98322cfcac29db0b3c63de504d62d98f396a2b54 Mon Sep 17 00:00:00 2001 From: Mike Kasberg Date: Thu, 12 May 2022 21:17:24 -0600 Subject: [PATCH] Add Numo/LAPACK SVD Option **Background:** The slow step of LSI is computing the SVD (singular value decomposition) of a matrix. Even with a relatively small collection of documents (say, about 20 blog posts), the native Ruby implementation is too slow to be usable (taking hours to complete). To work around this problem, classifier-reborn allows you to optionally use the `gsl` gem to make use of the [GNU Scientific Library](https://www.gnu.org/software/gsl/) when performing matrix calculations. Computations with this gem perform orders of magnitude faster than the Ruby-only matrix implementation, and they're fast enough that using LSI with Jekyll finishes in a reasonable amount of time (seconds). Unfortunately, [rb-gsl](https://github.com/SciRuby/rb-gsl) is unmaintained -- luckily, there's a commit on main that makes it compatible with Ruby 3, but nobody has released the gem so the only way to use rb-gsl with Ruby 3 right now is to specify the git hash in your Gemfile. See https://github.com/SciRuby/rb-gsl/issues/67 Notably, `rb-gsl` depends on [narray](https://github.com/masa16/narray#new-version-is-under-development---rubynumonarray). `narray` is deprecated, and the README suggests using `Numo::NArray` instead. **Changes:** In this PR, my goal is to provide an alternative matrix implementation that can perform singular value decomposition quickly and works with Ruby 3. Doing so will allow classifier-reborn to be used with Ruby 3 without depending on the unmaintained/unreleased GSL gem. Options for Ruby matrix libraries are somewhat limited, but [Numo](https://github.com/ruby-numo) seems to be more actively maintained than rb-gsl, and Numo has a working Ruby 3 implementation that can perform a singular value decomposition. 
This requires [numo-narray](https://github.com/ruby-numo/numo-narray) and [numo-linalg](https://github.com/ruby-numo/numo-linalg). My goal is to allow users to (optionally) use classifier-reborn with Numo/Lapack the same way they'd use it with GSL. That is, the user should install the `numo-narray` and `numo-linalg` gems (with their required C libraries), and classifier-reborn will detect and use these if they are found. --- .github/workflows/ci.yml | 18 ++++--- .rubocop.yml | 2 +- Gemfile | 7 ++- lib/classifier-reborn/lsi.rb | 64 +++++++++++++++++++---- lib/classifier-reborn/lsi/content_node.rb | 23 +++++--- test/extensions/matrix_test.rb | 2 +- test/extensions/zero_vector_test.rb | 2 +- test/lsi/lsi_test.rb | 8 +-- 8 files changed, 96 insertions(+), 30 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 033395f..13b3a70 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -14,17 +14,17 @@ on: jobs: ci: - name: "Run Tests (Ruby ${{ matrix.ruby_version }}, GSL: ${{ matrix.gsl }})" + name: "Run Tests (Ruby ${{ matrix.ruby_version }}, Lib: ${{ matrix.matrix_lib }})" runs-on: "ubuntu-latest" env: # See https://github.com/marketplace/actions/setup-ruby-jruby-and-truffleruby#matrix-of-gemfiles BUNDLE_GEMFILE: ${{ matrix.gemfile }} - LOAD_GSL: ${{ matrix.gsl }} + MATRIX_LIB: ${{ matrix.matrix_lib }} strategy: fail-fast: false matrix: ruby_version: ["2.7", "3.0", "3.1", "jruby-9.3.4.0"] - gsl: [true, false] + matrix_lib: ["none", "gsl", "lapack"] # We use `include` to assign the correct Gemfile for each ruby_version include: - ruby_version: "2.7" @@ -39,17 +39,23 @@ jobs: # Ruby 3.0 does not work with the latest released gsl gem # https://github.com/SciRuby/rb-gsl/issues/67 - ruby_version: "3.0" - gsl: true + matrix_lib: "gsl" # Ruby 3.1 does not work with the latest released gsl gem # https://github.com/SciRuby/rb-gsl/issues/67 - ruby_version: "3.1" - gsl: true + matrix_lib: "gsl" # jruby-9.3.4.0 doesn't easily build the 
gsl gem on a GitHub worker. Skipping for now. - ruby_version: "jruby-9.3.4.0" - gsl: true + matrix_lib: "gsl" + # jruby-9.3.4.0 doesn't easily build the numo gems on a GitHub worker. Skipping for now. + - ruby_version: "jruby-9.3.4.0" + matrix_lib: "lapack" steps: - name: Checkout Repository uses: actions/checkout@v3 + - name: Install Lapack + if: ${{ matrix.matrix_lib == 'lapack' }} + run: sudo apt-get install -y liblapacke-dev libopenblas-dev - name: "Set up ${{ matrix.label }}" uses: ruby/setup-ruby@v1 with: diff --git a/.rubocop.yml b/.rubocop.yml index d96a1e1..ea8dbc6 100644 --- a/.rubocop.yml +++ b/.rubocop.yml @@ -1,7 +1,7 @@ inherit_from: .rubocop_todo.yml Style/GlobalVars: - AllowedVariables: [$GSL] + AllowedVariables: [$SVD] Naming/MethodName: Exclude: diff --git a/Gemfile b/Gemfile index 7609f70..f6e55e2 100644 --- a/Gemfile +++ b/Gemfile @@ -4,4 +4,9 @@ source 'https://rubygems.org' gemspec name: 'classifier-reborn' # For testing with GSL support & bundle exec -gem 'gsl' if ENV['LOAD_GSL'] == 'true' +gem 'gsl' if ENV['MATRIX_LIB'] == 'gsl' + +if ENV['MATRIX_LIB'] == 'lapack' + gem 'numo-narray' + gem 'numo-linalg' +end diff --git a/lib/classifier-reborn/lsi.rb b/lib/classifier-reborn/lsi.rb index 378f1da..4a371ab 100644 --- a/lib/classifier-reborn/lsi.rb +++ b/lib/classifier-reborn/lsi.rb @@ -4,16 +4,31 @@ # Copyright:: Copyright (c) 2005 David Fayram II # License:: LGPL +# Try to load Numo first - it's the most current and the most well-supported. +# Fall back to GSL. +# Fall back to native vector. begin raise LoadError if ENV['NATIVE_VECTOR'] == 'true' # to test the native vector class, try `rake test NATIVE_VECTOR=true` + raise LoadError if ENV['GSL'] == 'true' - require 'gsl' # requires https://github.com/SciRuby/rb-gsl - require_relative 'extensions/vector_serialize' - $GSL = true + require 'numo/narray' + require 'numo/linalg' + $SVD = :numo + puts 'Using Numo!' 
rescue LoadError - $GSL = false - require_relative 'extensions/vector' - require_relative 'extensions/zero_vector' + begin + raise LoadError if ENV['NATIVE_VECTOR'] == 'true' # to test the native vector class, try `rake test NATIVE_VECTOR=true` + + require 'gsl' # requires https://github.com/SciRuby/rb-gsl + require_relative 'extensions/vector_serialize' + $SVD = :gsl + puts 'Using GSL!' + rescue LoadError + puts 'Using Ruby!' + $SVD = :ruby + require_relative 'extensions/vector' + require_relative 'extensions/zero_vector' + end end require_relative 'lsi/word_list' @@ -140,7 +155,15 @@ def build_index(cutoff = 0.75) doc_list = @items.values tda = doc_list.collect { |node| node.raw_vector_with(@word_list) } - if $GSL + if $SVD == :numo + tdm = Numo::NArray.asarray(tda.map(&:to_a)).transpose + ntdm = numo_build_reduced_matrix(tdm, cutoff) + + ntdm.each_over_axis(1).with_index do |col_vec, i| + doc_list[i].lsi_vector = col_vec + doc_list[i].lsi_norm = col_vec / Numo::Linalg.norm(col_vec) + end + elsif $SVD == :gsl tdm = GSL::Matrix.alloc(*tda).trans ntdm = build_reduced_matrix(tdm, cutoff) @@ -201,7 +224,9 @@ def proximity_array_for_content(doc, &block) content_node = node_for_content(doc, &block) result = @items.keys.collect do |item| - val = if $GSL + val = if $SVD == :numo + content_node.search_vector.dot(@items[item].transposed_search_vector) + elsif $SVD == :gsl content_node.search_vector * @items[item].transposed_search_vector else (Matrix[content_node.search_vector] * @items[item].search_vector)[0] @@ -220,7 +245,8 @@ def proximity_norms_for_content(doc, &block) return [] if needs_rebuild? content_node = node_for_content(doc, &block) - if $GSL && content_node.raw_norm.isnan?.all? + if ($SVD == :gsl && content_node.raw_norm.isnan?.all?) || + ($SVD == :numo && content_node.raw_norm.isnan.all?) 
puts "There are no documents that are similar to #{doc}" else content_node_norms(content_node) @@ -230,7 +256,9 @@ def proximity_norms_for_content(doc, &block) def content_node_norms(content_node) result = @items.keys.collect do |item| - val = if $GSL + val = if $SVD == :numo + content_node.search_norm.dot(@items[item].search_norm) + elsif $SVD == :gsl content_node.search_norm * @items[item].search_norm.col else (Matrix[content_node.search_norm] * @items[item].search_norm)[0] @@ -332,7 +360,21 @@ def build_reduced_matrix(matrix, cutoff = 0.75) s[ord] = 0.0 if s[ord] < s_cutoff end # Reconstruct the term document matrix, only with reduced rank - u * ($GSL ? GSL::Matrix : ::Matrix).diag(s) * v.trans + u * ($SVD == :gsl ? GSL::Matrix : ::Matrix).diag(s) * v.trans + end + + def numo_build_reduced_matrix(matrix, cutoff = 0.75) + # OPTIMIZE ME: Consider other drivers/options like sdd. + s, u, vt = Numo::Linalg.svd(matrix, driver: 'svd', job: 'S') + + # TODO: Better than 75% term, please. :\ + s_cutoff = s.sort.reverse[(s.size * cutoff).round - 1] + s.size.times do |ord| + s[ord] = 0.0 if s[ord] < s_cutoff + end + + # Reconstruct the term document matrix, only with reduced rank + u.dot(::Numo::DFloat.eye(s.size) * s).dot(vt) end def node_for_content(item, &block) diff --git a/lib/classifier-reborn/lsi/content_node.rb b/lib/classifier-reborn/lsi/content_node.rb index 91872b8..786dd72 100644 --- a/lib/classifier-reborn/lsi/content_node.rb +++ b/lib/classifier-reborn/lsi/content_node.rb @@ -29,7 +29,11 @@ def search_vector # Method to access the transposed search vector def transposed_search_vector - search_vector.col + if $SVD == :numo + search_vector + else + search_vector.col + end end # Use this to fetch the appropriate search vector in normalized form. @@ -40,7 +44,9 @@ def search_norm # Creates the raw vector out of word_hash using word_list as the # key for mapping the vector space. 
def raw_vector_with(word_list) - vec = if $GSL + vec = if $SVD == :numo + Numo::DFloat.zeros(word_list.size) + elsif $SVD == :gsl GSL::Vector.alloc(word_list.size) else Array.new(word_list.size, 0) @@ -51,7 +57,9 @@ def raw_vector_with(word_list) end # Perform the scaling transform and force floating point arithmetic - if $GSL + if $SVD == :numo + total_words = vec.sum.to_f + elsif $SVD == :gsl sum = 0.0 vec.each { |v| sum += v } total_words = sum @@ -61,7 +69,7 @@ def raw_vector_with(word_list) total_unique_words = 0 - if $GSL + if [:numo, :gsl].include?($SVD) vec.each { |word| total_unique_words += 1 if word != 0.0 } else total_unique_words = vec.count { |word| word != 0 } @@ -85,12 +93,15 @@ def raw_vector_with(word_list) hash[val] = Math.log(val + 1) / -weighted_total end - vec.collect! do |val| + vec = vec.map do |val| cached_calcs[val] end end - if $GSL + if $SVD == :numo + @raw_norm = vec / Numo::Linalg.norm(vec) + @raw_vector = vec + elsif $SVD == :gsl @raw_norm = vec.normalize @raw_vector = vec else diff --git a/test/extensions/matrix_test.rb b/test/extensions/matrix_test.rb index e142c35..587e221 100644 --- a/test/extensions/matrix_test.rb +++ b/test/extensions/matrix_test.rb @@ -2,7 +2,7 @@ class MatrixTest < Minitest::Test def test_zero_division - skip "extensions/vector is only used by non-GSL implementation" if $GSL + skip "extensions/vector is only used by non-GSL implementation" if $SVD != :ruby matrix = Matrix[[1, 0], [0, 1]] matrix.SV_decomp diff --git a/test/extensions/zero_vector_test.rb b/test/extensions/zero_vector_test.rb index 0ce53ad..95b5d4c 100644 --- a/test/extensions/zero_vector_test.rb +++ b/test/extensions/zero_vector_test.rb @@ -2,7 +2,7 @@ class ZeroVectorTest < Minitest::Test def test_zero? 
- skip "extensions/zero_vector is only used by non-GSL implementation" if $GSL + skip "extensions/zero_vector is only used by non-GSL implementation" if $SVD != :ruby vec0 = Vector[] vec1 = Vector[0] diff --git a/test/lsi/lsi_test.rb b/test/lsi/lsi_test.rb index 63ca5e0..984445c 100644 --- a/test/lsi/lsi_test.rb +++ b/test/lsi/lsi_test.rb @@ -1,6 +1,8 @@ # frozen_string_literal: true require File.dirname(__FILE__) + '/../test_helper' +# require_relative '../test_helper' +# require 'debug' class LSITest < Minitest::Test def setup @@ -163,7 +165,7 @@ def test_cached_content_node_option end def test_clears_cached_content_node_cache - skip "transposed_search_vector is only used by GSL implementation" unless $GSL + skip "transposed_search_vector is only used by GSL implementation" if $SVD == :ruby lsi = ClassifierReborn::LSI.new(cache_node_vectors: true) lsi.add_item @str1, 'Dog' @@ -191,8 +193,8 @@ def test_keyword_search assert_equal %i[dog text deal], lsi.highest_ranked_stems(@str1) end - def test_invalid_searching_when_using_gsl - skip "Only GSL currently raises invalid search error" unless $GSL + def test_invalid_searching_with_linalg_lib + skip "Only GSL currently raises invalid search error" if $SVD == :ruby lsi = ClassifierReborn::LSI.new lsi.add_item @str1, 'Dog'