require 'algoliasearch'
require 'json'
require 'nokogiri'
require_relative './version'
require_relative './record_extractor'
require_relative './credential_checker'
require_relative './error_handler'

# Implementation of the `jekyll algolia push` command.
#
# Every method lives on the singleton class, so the command is driven through
# `AlgoliaSearchJekyllPush.<method>` and keeps its state (options, config,
# verbose / dry-run flags) in class-level instance variables.
class AlgoliaSearchJekyllPush < Jekyll::Command
  class << self
    attr_accessor :options, :config

    # Intentionally empty: nothing is registered on the program here.
    def init_with_program(_prog)
    end

    # Store what was passed on the command line
    # (`jekyll algolia push ARG1 ARG2 --OPTION_NAME1 OPTION_VALUE1`) together
    # with the configuration coming from _config.yml.
    #
    # The verbose and dry-run flags are read from `config`.
    # Returns self so calls can be chained.
    def init_options(args = [], options = {}, config = {})
      @args = args || []
      @options = options
      @config = config
      @is_verbose = @config['verbose']
      @is_dry_run = @config['dry_run']
      self
    end

    # Tell whether `file` should be indexed.
    #
    # Rejected: static files, files whose extension is neither `html` nor one
    # of the configured `markdown_ext` entries, files whose basename is `404`,
    # and anything matched by `excluded_file?`.
    def indexable?(file)
      # Static assets (images, fonts, etc) are never indexed
      return false if file.is_a?(Jekyll::StaticFile)

      # Jekyll converts markdown to HTML, so only those two families of
      # extensions are accepted
      markdown_ext = @config['markdown_ext']
      extensions = %w[html]
      extensions += markdown_ext.split(',') if markdown_ext

      extension = File.extname(File.basename(file.path))
      # Compare without the leading dot
      return false unless extensions.include?(extension[1..-1])

      # GitHub pages custom 404 page
      # https://help.github.com/articles/creating-a-custom-404-page-for-your-github-pages-site/
      return false if File.basename(file.path, extension) == '404'

      # Built-in blacklist, user blacklist and user hook
      !excluded_file?(file)
    end

    # Tell whether `file` is excluded, either by the built-in pattern, by an
    # entry of `algolia.excluded_files`, or by the custom hook.
    #
    # String entries are matched literally (they are escaped before being
    # turned into a regexp); other entries are matched as given.
    def excluded_file?(file)
      user_patterns =
        if @config['algolia']
          @config['algolia']['excluded_files'] || []
        else
          []
        end

      # The first pattern targets the pagination pages generated by Jekyll
      patterns = [%r{^page([0-9]*)/index\.html}] + user_patterns

      blacklisted = patterns.any? do |pattern|
        regexp = pattern.is_a?(String) ? /#{Regexp.quote(pattern)}/ : pattern
        file.path =~ regexp
      end
      return true if blacklisted

      # Last chance for the user to exclude the file through the hook
      custom_hook_excluded_file?(file) ? true : false
    end

    # Hook meant to be overridden when `algolia.excluded_files` is not enough.
    # The default implementation excludes nothing.
    def custom_hook_excluded_file?(_file)
      false
    end

    # Build a Jekyll site whose `write` step pushes records to Algolia instead
    # of writing files to disk.
    def jekyll_new(config)
      site = Jekyll::Site.new(config)

      # Singleton override of `write`: inside this method `config` is the
      # site's own configuration, not the argument of `jekyll_new`.
      def site.write
        verbose = config['verbose']
        records = []

        each_site_file do |file|
          # Only keep files that pass the indexing rules
          next unless AlgoliaSearchJekyllPush.indexable?(file)

          Jekyll.logger.info "Extracting data from #{file.path}" if verbose
          extracted = AlgoliaSearchRecordExtractor.new(file).extract
          next if extracted.nil?

          ap extracted if verbose
          records += extracted
        end

        AlgoliaSearchJekyllPush.push(records)
      end

      site
    end

    # Apply the index settings: the defaults below, overridden key by key with
    # `algolia.settings` from the configuration when present.
    #
    # If setting them raises a StandardError, the error is displayed and the
    # process exits with status 1.
    def configure_index(index)
      settings = {
        distinct: true,
        attributeForDistinct: 'url',
        attributesForFaceting: %w[tags type title],
        attributesToIndex: [
          'title', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
          'unordered(text)', 'unordered(tags)'
        ],
        attributesToRetrieve: nil,
        customRanking: [
          'desc(posted_at)',
          'desc(weight.tag_name)',
          'asc(weight.position)'
        ],
        highlightPreTag: '',
        highlightPostTag: ''
      }

      # User keys are symbolized so they replace the matching default
      custom_settings =
        (@config['algolia'] && @config['algolia']['settings']) || []
      custom_settings.each { |key, value| settings[key.to_sym] = value }

      begin
        index.set_settings(settings)
      rescue StandardError => error
        display_error(error)
        exit 1
      end
    end

    # Display `error` through the error handler when it can produce a readable
    # version of the message, otherwise log the raw message.
    def display_error(error)
      handler = AlgoliaSearchErrorHandler.new
      readable = handler.readable_algolia_error(error.message)
      return handler.display(readable) if readable

      Jekyll.logger.error 'Algolia Error: HTTP Error'
      Jekyll.logger.warn error.message
    end

    # Set a User-Agent header containing the plugin version, so calls coming
    # from this plugin can be told apart.
    def set_user_agent_header
      plugin_version = AlgoliaSearchJekyllVersion.to_s
      user_agent = "Algolia for Jekyll #{plugin_version}"
      Algolia.set_extra_header('User-Agent', user_agent)
    end

    # Return the index object for `index_name`. Settings are applied to it
    # unless this is a dry run.
    def create_index(index_name)
      set_user_agent_header
      Algolia::Index.new(index_name).tap do |index|
        configure_index(index) unless @is_dry_run
      end
    end

    # Send `items` to `index` in batches of 1000.
    #
    # Each batch is logged even in dry-run mode, but only sent otherwise. If a
    # batch raises a StandardError, the error is displayed and the process
    # exits with status 1.
    def batch_add_items(items, index)
      items.each_slice(1000) do |batch|
        Jekyll.logger.info "Indexing #{batch.size} items"
        next if @is_dry_run

        begin
          index.add_objects!(batch)
        rescue StandardError => error
          display_error(error)
          exit 1
        end
      end
    end

    # Push `items` to the index named by `algolia.index_name`.
    #
    # Records are first added to a `<index_name>_tmp` index, which is then
    # moved onto the real name. In dry-run mode the move is skipped.
    def push(items)
      AlgoliaSearchCredentialChecker.new(@config).assert_valid

      Jekyll.logger.info '=== DRY RUN ===' if @is_dry_run

      target_name = @config['algolia']['index_name']
      staging_name = "#{target_name}_tmp"

      staging_index = create_index(staging_name)
      batch_add_items(items, staging_index)
      Algolia.move_index(staging_name, target_name) unless @is_dry_run

      Jekyll.logger.info "Indexing of #{items.size} items in #{target_name} done."
    end
  end
end