Make diff batches the default indexing mode

This commit is contained in:
Pixelastic 2018-01-30 15:13:49 +01:00
parent 11567d5b5a
commit 531c90777b
8 changed files with 212 additions and 527 deletions

View File

@ -33,6 +33,8 @@ module Jekyll
exit 1 unless Configurator.assert_valid_credentials
Configurator.warn_of_deprecated_options
if Configurator.dry_run?
Logger.log('W:==== THIS IS A DRY RUN ====')
Logger.log('W: - No records will be pushed to your index')
@ -72,9 +74,10 @@ module Jekyll
# Public: Get access to the time at which the command was run
#
# Jekyll overrides some dates with the current time: it always sets the
# updated time of pages to the time of the build run. The plugin needs those
# values to stay at nil when they did not actually change, so we keep track
# of the time at which the command was run, compare dates against it, and
# revert any page date equal to it back to nil.
#
# Returns the value stored in @start_time (assigned outside of this method)
def self.start_time
@start_time
end

View File

@ -12,7 +12,6 @@ module Jekyll
'files_to_exclude' => nil,
'nodes_to_index' => 'p',
'indexing_batch_size' => 1000,
'indexing_mode' => 'diff',
'settings' => {
'distinct' => true,
'attributeForDistinct' => 'url',
@ -129,18 +128,6 @@ module Jekyll
ALGOLIA_DEFAULTS['settings'].merge(user_settings)
end
# Public: Return the current indexing mode
#
# The mode is read from the `indexing_mode` option of the user config
# (_config.yml), falling back to the value stored in ALGOLIA_DEFAULTS when
# the option is not set. Only `diff` and `atomic` are authorized values, any
# other value resolves to `diff`.
#
# Returns either 'diff' or 'atomic'
def self.indexing_mode
  authorized_modes = %w[diff atomic]
  configured = algolia('indexing_mode') || ALGOLIA_DEFAULTS['indexing_mode']
  authorized_modes.include?(configured) ? configured : 'diff'
end
# Public: Check that all credentials are set
#
# Returns true if everything is ok, false otherwise. Will display helpful
@ -197,6 +184,20 @@ module Jekyll
return true if value == true
false
end
# Public: Check for any deprecated config option and warn the user
#
# Only the `indexing_mode` option is checked. Nothing is logged when it is
# not set (nil); otherwise a notice explaining that the option is no longer
# used is sent to the Logger, one line at a time.
#
# Returns nil when nothing is logged, otherwise whatever the last Logger.log
# call returned
def self.warn_of_deprecated_options
  # indexing_mode is no longer used
  deprecated_mode = algolia('indexing_mode')
  return if deprecated_mode.nil?

  # rubocop:disable Metrics/LineLength
  notice = [
    'I:',
    'W:[jekyll-algolia] You are using the algolia.indexing_mode option which has been deprecated in v1.1',
    'I: Indexing is now always using an atomic diff algorithm.',
    'I: This option is no longer necessary, you can remove it from your _config.yml',
    'I:'
  ]
  # rubocop:enable Metrics/LineLength

  # Lines are logged in order; the value of the final call is handed back
  notice.map { |line| Logger.log(line) }.last
end
end
end
end

View File

@ -45,60 +45,6 @@ module Jekyll
::Algolia::Index.new(index_name)
end
# Public: Check if an index exists
#
# index_name - Name of the index
#
# Note: there is no API endpoint to do that, so we try to get the settings
# instead, which will fail if the index does not exist
#
# Returns true when the settings could be read, false when reading them
# raised a StandardError
def self.index?(index_name)
  index(index_name).get_settings
  true
rescue StandardError
  false
end
# Public: Update records of the specified index
#
# index - Algolia Index to update
# records - Array of records to update
#
# Records are pushed in slices of `indexing_batch_size` elements. New
# records will be automatically added. Technically existing records should
# be updated but this case should never happen as changing a record content
# will change its objectID as well.
#
# In dry run mode each slice is still logged, but nothing is pushed. Any
# StandardError raised by a push is handed to ErrorHandler.stop along with
# the full list of records.
def self.update_records(index, records)
  slice_size = Configurator.algolia('indexing_batch_size')

  records.each_slice(slice_size) do |chunk|
    Logger.log("I:Pushing #{chunk.size} records")

    unless Configurator.dry_run?
      begin
        index.add_objects!(chunk)
      rescue StandardError => e
        ErrorHandler.stop(e, records: records)
      end
    end
  end
end
# Public: Delete records whose objectIDs are passed
#
# index - Algolia Index to target
# ids - Array of objectIDs to delete
#
# Nothing happens (not even a log) when the list of ids is empty. In dry
# run mode the deletion is logged but not performed. Any StandardError
# raised by the deletion is handed to ErrorHandler.stop.
def self.delete_records_by_id(index, ids)
  return if ids.empty?

  Logger.log("I:Deleting #{ids.length} records")

  unless Configurator.dry_run?
    begin
      index.delete_objects!(ids)
    rescue StandardError => e
      ErrorHandler.stop(e)
    end
  end
end
# Public: Returns an array of all the objectIDs in the index
#
# index - Algolia Index to target
@ -145,140 +91,47 @@ module Jekyll
end
end
# Public: Index content following the `diff` indexing mode
#
# records - Array of local records
#
# The `diff` indexing mode will only push new content to the index and
# remove old content from it. It won't touch records that haven't been
# updated. It will be a bit slower as it will first need to get the list
# of all records in the index, but it will consume less operations.
#
# Settings are always pushed first. When the remote and local lists of
# objectIDs are identical, a message is logged and nothing else is done.
def self.run_diff_mode(records)
  target_index = index(Configurator.index_name)

  # Update settings
  update_settings(target_index, Configurator.settings)

  # Getting list of objectID in remote and locally
  remote_ids = remote_object_ids(target_index)
  local_ids = local_object_ids(records)

  old_records_ids = remote_ids - local_ids
  new_records_ids = local_ids - remote_ids

  if old_records_ids.empty? && new_records_ids.empty?
    Logger.log('I:Nothing to index. Your content is already up to date.')
    return
  end

  Logger.log("I:Updating records in index #{target_index.name}...")

  # Delete remote records that are no longer available locally
  delete_records_by_id(target_index, old_records_ids)

  # Add only records that are not yet already in the remote. The ids are
  # put in a Hash first so each record costs one lookup instead of one scan
  # of the whole list of new ids.
  ids_to_add = new_records_ids.each_with_object({}) do |id, lookup|
    lookup[id] = true
  end
  new_records = records.select do |record|
    ids_to_add.key?(record[:objectID])
  end
  update_records(target_index, new_records)

  Logger.log('I:✔ Indexing complete')
end
# Public: Get the settings of the remote index
#
# index - The Algolia Index
#
# Returns whatever `get_settings` returns for this index. If that call
# raises a StandardError, the error is handed to ErrorHandler.stop and the
# result of that call is returned instead.
def self.remote_settings(index)
  index.get_settings
rescue StandardError => e
  ErrorHandler.stop(e)
end
# Public: Rename an index
#
# old_name - Current name of the index
# new_name - New name of the index
# index_name - The Algolia index
# old_records_ids - Ids of records to delete from the index
# new_records - Records to add to the index
#
# Note: Operations are sent through the batch endpoint in slices of
# `indexing_batch_size`, so they only go out as one single batch when they
# all fit in one slice
# Does nothing in dry run mode
def self.rename_index(old_name, new_name)
Logger.verbose("I:Renaming `#{old_name}` to `#{new_name}`")
def self.update_records(index_name, old_records_ids, new_records)
Logger.log("I:Records to delete: #{old_records_ids.length}")
Logger.log("I:Records to add: #{new_records.length}")
return if Configurator.dry_run?
begin
::Algolia.move_index!(old_name, new_name)
rescue StandardError => error
ErrorHandler.stop(error, new_name: new_name)
operations = []
old_records_ids.each do |object_id|
operations << {
action: 'deleteObject',
indexName: index_name,
body: {
objectID: object_id
}
}
end
end
# Public: Copy an index
#
# old_name - Current name of the index
# new_name - New name of the index
#
# The copy is always announced through Logger.verbose. It is then skipped
# in dry run mode, or when the source index does not exist. Any
# StandardError raised by the copy is handed to ErrorHandler.stop along
# with the name of the new index.
def self.copy_index(old_name, new_name)
  Logger.verbose("I:Copying `#{old_name}` to `#{new_name}`")

  # Dry run is checked first, so the source index is not even looked up then
  return if Configurator.dry_run? || !index?(old_name)

  begin
    ::Algolia.copy_index!(old_name, new_name)
  rescue StandardError => e
    ErrorHandler.stop(e, new_name: new_name)
  end
end
# Public: Index content following the `atomic` indexing mode
#
# records - Array of records to push
#
# The `atomic` mode will first create a hidden copy of the current index.
# It will then update this copy following the same logic as the `diff`
# mode, deleting old records and adding new ones. Once finished, it will
# overwrite the current index with this hidden one.
def self.run_atomic_mode(records)
index_name = Configurator.index_name
index = index(index_name)
index_tmp_name = "#{Configurator.index_name}_tmp"
index_tmp = index(index_tmp_name)
# Getting list of objectID in remote and locally
remote_ids = remote_object_ids(index)
local_ids = local_object_ids(records)
old_records_ids = remote_ids - local_ids
new_records_ids = local_ids - remote_ids
if old_records_ids.empty? && new_records_ids.empty?
Logger.log('I:Nothing to index. Your content is already up to date.')
return
new_records.each do |new_record|
operations << {
action: 'addObject',
indexName: index_name,
body: new_record
}
end
# Copying original index to temporary one
Logger.verbose("I:Using `#{index_tmp_name}` as temporary index")
copy_index(index_name, index_tmp_name)
# Update settings
Logger.verbose("I:Updating `#{index_tmp_name}` settings")
update_settings(index_tmp, Configurator.settings)
Logger.log("I:Updating records in index #{index_tmp_name}...")
# Delete remote records that are no longer available locally
delete_records_by_id(index_tmp, old_records_ids)
# Add only records that are not yet already in the remote
new_records = records.select do |record|
new_records_ids.include?(record[:objectID])
# Run the batches in slices if they are too large
batch_size = Configurator.algolia('indexing_batch_size')
operations.each_slice(batch_size) do |slice|
begin
::Algolia.batch!(slice)
rescue StandardError => error
ErrorHandler.stop(error)
end
end
update_records(index_tmp, new_records)
# Renaming the new index in place of the old
Logger.verbose("I:Overwriting `#{index_name}` with `#{index_tmp_name}`")
rename_index(index_tmp_name, index_name)
Logger.log('I:✔ Indexing complete')
end
# Public: Push all records to Algolia and configure the index
@ -300,9 +153,35 @@ module Jekyll
exit 1
end
indexing_mode = Configurator.indexing_mode
Logger.verbose("I:Indexing mode: #{indexing_mode}")
send("run_#{indexing_mode}_mode".to_sym, records)
index_name = Configurator.index_name
index = index(index_name)
# Update settings
update_settings(index, Configurator.settings)
# Getting list of objectID in remote and locally
remote_ids = remote_object_ids(index)
local_ids = local_object_ids(records)
# Getting list of what to add and what to delete
old_records_ids = remote_ids - local_ids
new_records_ids = local_ids - remote_ids
# Stop if nothing to change
if old_records_ids.empty? && new_records_ids.empty?
Logger.log('I:Nothing to index. Your content is already up to date.')
return
end
Logger.log("I:Updating records in index #{index_name}...")
new_records = []
records.each do |record|
next unless new_records_ids.include?(record[:objectID])
new_records << record
end
update_records(index_name, old_records_ids, new_records)
Logger.log('I:✔ Indexing complete')
end
end
end

View File

@ -4,15 +4,17 @@
require 'spec_helper'
describe(Jekyll::Algolia) do
let(:configurator) { Jekyll::Algolia::Configurator }
let(:current) { Jekyll::Algolia }
let(:indexer) { Jekyll::Algolia::Indexer }
let(:hooks) { Jekyll::Algolia::Hooks }
let(:extractor) { Jekyll::Algolia::Extractor }
let(:logger) { Jekyll::Algolia::Logger }
let(:hooks) { Jekyll::Algolia::Hooks }
let(:indexer) { Jekyll::Algolia::Indexer }
# Suppress Jekyll log about not having a config file
before do
allow(Jekyll.logger).to receive(:warn)
allow(Jekyll::Algolia::Logger).to receive(:log)
allow(logger).to receive(:log)
end
describe '.init' do
@ -22,7 +24,7 @@ describe(Jekyll::Algolia) do
subject { current.init(config) }
before do
allow(Jekyll::Algolia::Configurator)
allow(configurator)
.to receive(:assert_valid_credentials)
.and_return(true)
end
@ -33,12 +35,17 @@ describe(Jekyll::Algolia) do
it 'should make the site accessible from the outside' do
expect(subject.site.config).to include(config)
end
it 'should check for deprecation warnings' do
expect(configurator).to receive(:warn_of_deprecated_options)
current.init(config)
end
end
context 'with invalid Algolia credentials' do
subject { -> { current.init(config) } }
before do
allow(Jekyll::Algolia::Configurator)
allow(configurator)
.to receive(:assert_valid_credentials)
.and_return(false)
end

View File

@ -5,6 +5,7 @@ require 'spec_helper'
describe(Jekyll::Algolia::Configurator) do
let(:current) { Jekyll::Algolia::Configurator }
let(:logger) { Jekyll::Algolia::Logger }
let(:config) { {} }
before do
allow(Jekyll::Algolia).to receive(:config).and_return(config)
@ -231,34 +232,6 @@ describe(Jekyll::Algolia::Configurator) do
end
end
# Only `diff` and `atomic` are kept as is; no value or an unknown value
# resolves to `diff`
describe 'indexing_mode' do
  subject { current.indexing_mode }

  # Each context decides what the user configuration holds for this option
  before do
    allow(current)
      .to receive(:algolia).with('indexing_mode')
      .and_return(indexing_mode)
  end

  context 'with default values' do
    let(:indexing_mode) { nil }

    it { is_expected.to eq('diff') }
  end

  context 'with diff selected' do
    let(:indexing_mode) { 'diff' }

    it { is_expected.to eq('diff') }
  end

  context 'with atomic selected' do
    let(:indexing_mode) { 'atomic' }

    it { is_expected.to eq('atomic') }
  end

  context 'with an invalid mode selected' do
    let(:indexing_mode) { 'chunky_bacon' }

    it { is_expected.to eq('diff') }
  end
end
describe 'dry_run?' do
subject { current.dry_run? }
@ -304,5 +277,33 @@ describe(Jekyll::Algolia::Configurator) do
it { should eq false }
end
end
# Checks that a deprecation notice is logged only when a deprecated option
# is present in the configuration
describe 'warn_of_deprecated_options' do
context 'using indexing_mode' do
# Each context decides what the configuration holds for `indexing_mode`
before do
allow(current)
.to receive(:algolia)
.with('indexing_mode')
.and_return(indexing_mode)
end
context 'with no value' do
let(:indexing_mode) { nil }
# Expectation set before the call: the example fails if anything is logged
before do
expect(logger).to_not receive(:log)
end
it { current.warn_of_deprecated_options }
end
context 'with a deprecated value' do
let(:indexing_mode) { 'atomic' }
before do
# Accept any log line, but require at least one whose message starts
# with W
allow(logger).to receive(:log)
expect(logger).to receive(:log).with(/^W/).at_least(:once)
end
it { current.warn_of_deprecated_options }
end
end
end
end
# rubocop:enable Metrics/BlockLength

View File

@ -61,6 +61,8 @@ describe(Jekyll::Algolia::Extractor) do
context 'with a page with divs' do
let(:content) { site.__find_file('only-divs.md').content }
before do
allow(configurator)
.to receive(:algolia)
allow(configurator)
.to receive(:algolia)
.with('nodes_to_index')

View File

@ -5,6 +5,7 @@ require 'spec_helper'
describe(Jekyll::Algolia::FileBrowser) do
let(:current) { Jekyll::Algolia::FileBrowser }
let(:configurator) { Jekyll::Algolia::Configurator }
let(:site) { init_new_jekyll_site }
# Suppress Jekyll log about reading the config file
@ -132,7 +133,9 @@ describe(Jekyll::Algolia::FileBrowser) do
context 'with custom config' do
before do
allow(Jekyll::Algolia::Configurator)
allow(configurator)
.to receive(:algolia)
allow(configurator)
.to receive(:algolia)
.with('extensions_to_index')
.and_return('html,dhtml')

View File

@ -74,115 +74,6 @@ describe(Jekyll::Algolia::Indexer) do
it { should eq 'custom_index' }
end
# An index is considered as existing when its settings can be read
describe 'index?' do
  subject { current.index?('foo') }

  let(:index) { double('Algolia::Index', get_settings: nil) }

  # The method under test must go through `index` to reach the Algolia index
  before { expect(current).to receive(:index).and_return(index) }

  it { is_expected.to eq(true) }

  context 'when no settings' do
    # Reading the settings raises, as it would for a missing index
    before { expect(index).to receive(:get_settings).and_raise }

    it { is_expected.to eq(false) }
  end
end
# Records are pushed with `add_objects!`, in slices of `indexing_batch_size`
describe 'update_records' do
  let(:index) do
    double('Algolia::Index', add_objects!: nil, name: 'my_index')
  end

  context 'with a small number of records' do
    let(:records) { Array.new(10, foo: 'bar') }
    before { current.update_records(index, records) }
    it do
      expect(index)
        .to have_received(:add_objects!)
        .with(records)
        .once
    end
  end

  context 'with a large number of records' do
    let(:records) { Array.new(2500, foo: 'bar') }
    before { current.update_records(index, records) }
    it do
      expect(index)
        .to have_received(:add_objects!)
        .exactly(3).times
    end
  end

  context 'with a custom batch size' do
    let(:records) { Array.new(2500, foo: 'bar') }
    before do
      allow(configurator)
        .to receive(:algolia)
        .with('indexing_batch_size')
        .and_return(500)
    end
    before { current.update_records(index, records) }
    it do
      expect(index)
        .to have_received(:add_objects!)
        .exactly(5).times
    end
  end

  context 'when running a dry run' do
    # NOTE(review): `dry_run` is presumably fed into Configurator.dry_run? by
    # a hook defined outside of this block - confirm
    let(:dry_run) { true }
    let(:records) { Array.new(10, foo: 'bar') }
    # The method has to be called here as well, otherwise the negative
    # expectation below can never fail
    before { current.update_records(index, records) }
    it do
      expect(index)
        .to_not have_received(:add_objects!)
    end
  end
end
# Records are deleted with `delete_objects!`, except in dry run mode or when
# there is nothing to delete
describe '.delete_records_by_id' do
let(:index) do
double('Algolia::Index', delete_objects!: nil, name: 'my_index')
end
let(:ids) { %w[foo bar baz] }
# Shared by every example below, contexts only override `ids` / `dry_run`
before { current.delete_records_by_id(index, ids) }
it do
expect(index)
.to have_received(:delete_objects!)
.with(ids)
end
context 'when running a dry run' do
# NOTE(review): `dry_run` is presumably fed into Configurator.dry_run? by a
# hook defined outside of this block - confirm
let(:dry_run) { true }
it do
expect(index)
.to_not have_received(:delete_objects!)
.with(ids)
end
end
context 'when deleting zero records' do
let(:ids) { [] }
# NOTE(review): this hook is declared in a nested group, so it presumably
# runs after the describe-level hook that already called the method; the
# `log` expectation below may then pass without checking anything - confirm
before do
allow(logger).to receive(:log)
end
it do
expect(logger).to_not have_received(:log)
expect(index).to_not have_received(:delete_objects!)
end
end
end
describe '.remote_object_ids' do
subject { current.remote_object_ids(index) }
@ -225,109 +116,6 @@ describe(Jekyll::Algolia::Indexer) do
end
end
# The diff mode deletes what is only in the remote index, adds what is only
# available locally, and pushes the settings
describe '.run_diff_mode' do
# Local has foo and bar, remote has foo and baz: baz is to delete, bar to add
let(:local_records) do
[
{ objectID: 'foo' },
{ objectID: 'bar' }
]
end
let(:remote_ids) { %w[foo baz] }
# Every collaborator of the method is stubbed, so nothing reaches Algolia
before do
allow(current)
.to receive(:index)
.and_return(
double('Algolia::Index', new: 'my_index', name: 'my_index')
)
allow(current).to receive(:remote_object_ids).and_return(remote_ids)
allow(current).to receive(:delete_records_by_id)
allow(current).to receive(:update_records)
allow(current).to receive(:update_settings)
allow(configurator).to receive(:settings).and_return('my_settings')
end
before { current.run_diff_mode(local_records) }
it do
expect(current)
.to have_received(:delete_records_by_id)
.with(anything, ['baz'])
expect(current)
.to have_received(:update_records)
.with(anything, [{ objectID: 'bar' }])
expect(current)
.to have_received(:update_settings)
.with(anything, 'my_settings')
end
context 'nothing changed since last update' do
# Same ids on both sides
let(:local_records) do
[
{ objectID: 'foo' },
{ objectID: 'bar' }
]
end
let(:remote_ids) { %w[foo bar] }
# NOTE(review): this hook is declared in a nested group, so it presumably
# runs after the describe-level hook that already called the method; the
# expectation below then relies on `logger.log` being stubbed earlier,
# outside of this block - confirm
before do
allow(logger).to receive(:log)
end
it do
expect(logger).to have_received(:log).with(/Nothing to index/)
end
end
end
# Renaming goes through `::Algolia.move_index!`, except in dry run mode
describe '.rename_index' do
  # Stub the API call first, then run the method for every example
  before do
    allow(::Algolia).to receive(:move_index!)
    current.rename_index('foo', 'bar')
  end

  it do
    expect(::Algolia).to have_received(:move_index!).with('foo', 'bar')
  end

  context 'when running a dry run' do
    let(:dry_run) { true }

    it { expect(::Algolia).to_not have_received(:move_index!) }
  end
end
# Copying goes through `::Algolia.copy_index!`, except when the source index
# does not exist or in dry run mode
describe '.copy_index' do
let(:index_exists) { true }
# Stubs are set before the call; contexts only override the `let` values
before do
allow(current).to receive(:index?).and_return(index_exists)
allow(::Algolia).to receive(:copy_index!)
current.copy_index('foo', 'bar')
end
it do
expect(::Algolia).to have_received(:copy_index!).with('foo', 'bar')
end
context 'when no source index' do
let(:index_exists) { false }
it do
expect(::Algolia)
.to_not have_received(:copy_index!)
end
end
context 'when running a dry run' do
# NOTE(review): `dry_run` is presumably fed into Configurator.dry_run? by a
# hook defined outside of this block - confirm
let(:dry_run) { true }
it do
expect(::Algolia)
.to_not have_received(:copy_index!)
end
end
end
describe '.update_settings' do
let(:index) { double('Algolia::Index', set_settings!: nil) }
let(:settings) { { 'foo' => 'bar' } }
@ -347,108 +135,109 @@ describe(Jekyll::Algolia::Indexer) do
end
end
describe '.remote_settings' do
subject { current.remote_settings(index) }
describe '.update_records' do
let(:index_name) { 'my_index' }
let(:old_records_ids) { %w[abc] }
let(:new_records) { [{ 'objectID' => 'def' }] }
let(:indexing_batch_size) { 1000 }
let(:index) { double('Algolia::Index').as_null_object }
before { allow(::Algolia).to receive(:batch!) }
before do
expect(index)
.to receive(:get_settings)
.and_return('custom_settings')
allow(configurator)
.to receive(:algolia)
.with('indexing_batch_size')
.and_return(indexing_batch_size)
end
before { current.update_records(index_name, old_records_ids, new_records) }
it { should eq 'custom_settings' }
end
describe '.run_atomic_mode' do
let(:local_records) do
[
{ objectID: 'foo' },
{ objectID: 'bar' }
]
end
let(:remote_ids) { %w[foo baz] }
let(:index) { double('Algolia::Index', new: 'my_index', name: 'my_index') }
let(:index_tmp) do
double('Algolia::Index', new: 'my_index_tmp', name: 'my_index_tmp')
end
before do
allow(configurator).to receive(:index_name).and_return('my_index')
allow(configurator).to receive(:settings).and_return('settings')
allow(current).to receive(:index).with('my_index').and_return(index)
allow(current)
.to receive(:index).with('my_index_tmp').and_return(index_tmp)
allow(current).to receive(:remote_object_ids).and_return(remote_ids)
allow(current).to receive(:copy_index)
allow(current).to receive(:update_settings)
allow(current).to receive(:delete_records_by_id)
allow(current).to receive(:update_records)
allow(current).to receive(:rename_index)
end
before { current.run_atomic_mode(local_records) }
it do
expect(current)
.to have_received(:copy_index)
.with('my_index', 'my_index_tmp')
expect(current)
.to have_received(:update_settings)
.with(index_tmp, 'settings')
expect(current)
.to have_received(:delete_records_by_id)
.with(index_tmp, ['baz'])
expect(current)
.to have_received(:update_records)
.with(index_tmp, [{ objectID: 'bar' }])
expect(current)
.to have_received(:rename_index)
.with('my_index_tmp', 'my_index')
end
context 'nothing changed since last update' do
let(:local_records) do
[
{ objectID: 'foo' },
{ objectID: 'bar' }
]
end
let(:remote_ids) { %w[foo bar] }
before do
allow(logger).to receive(:log)
end
context 'when running a dry run' do
let(:dry_run) { true }
it do
expect(logger).to have_received(:log).with(/Nothing to index/)
expect(::Algolia)
.to_not have_received(:batch!)
end
end
it 'should batch all operations' do
expect(::Algolia)
.to have_received(:batch!)
.with([
{
action: 'deleteObject',
indexName: 'my_index',
body: { objectID: 'abc' }
},
{
action: 'addObject',
indexName: 'my_index',
body: { 'objectID' => 'def' }
}
])
end
context 'split in smaller batches if too many operations' do
let(:indexing_batch_size) { 1 }
it do
expect(::Algolia)
.to have_received(:batch!)
.ordered
.with([
{
action: 'deleteObject',
indexName: 'my_index',
body: { objectID: 'abc' }
}
])
expect(::Algolia)
.to have_received(:batch!)
.ordered
.with([
{
action: 'addObject',
indexName: 'my_index',
body: { 'objectID' => 'def' }
}
])
end
end
end
describe '.run' do
let(:indexing_mode) { 'diff' }
let(:records) { [{ objectID: 'foo' }, { objectID: 'bar' }] }
let(:remote_ids) { %w[foo baz] }
let(:settings) { 'settings' }
let(:index_name) { 'my_index' }
before do
allow(configurator).to receive(:settings).and_return(settings)
allow(configurator).to receive(:index_name).and_return(index_name)
allow(current).to receive(:init)
allow(current).to receive(:run_diff_mode)
allow(current).to receive(:run_atomic_mode)
allow(configurator).to receive(:indexing_mode).and_return(indexing_mode)
allow(current).to receive(:index).and_return('my_index')
allow(current).to receive(:update_settings)
allow(current).to receive(:remote_object_ids).and_return(remote_ids)
allow(current).to receive(:update_records)
end
context 'with records' do
let(:records) { [{ 'objectID' => 'foo' }, { 'objectID' => 'bar' }] }
before { current.run(records) }
it { expect(current).to have_received(:init) }
context 'when in diff mode' do
let(:indexing_mode) { 'diff' }
it { expect(current).to have_received(:run_diff_mode) }
it { expect(current).to_not have_received(:run_atomic_mode) }
it do
expect(current)
.to have_received(:update_settings)
.with('my_index', settings)
end
context 'when in atomic mode' do
let(:indexing_mode) { 'atomic' }
it { expect(current).to have_received(:run_atomic_mode) }
it { expect(current).to_not have_received(:run_diff_mode) }
it do
expect(current)
.to have_received(:update_records)
.with(index_name, ['baz'], [{ objectID: 'bar' }])
end
context 'when nothing changed' do
let(:remote_ids) { %w[foo bar] }
it do
expect(current)
.to_not have_received(:update_records)
end
end
end