# jekyll-algolia/lib/record_extractor.rb

require 'algoliasearch'
require 'nokogiri'
require 'json'

# Given an HTML file as input, will return an array of records to index.
#
# Every node matching the configured `record_css_selector` (`p` by default)
# becomes one record made of the file metadata, the headings leading to the
# node and the content of the node itself. The `custom_hook_each` and
# `custom_hook_all` methods return their input untouched and are meant to be
# redefined to alter the records.
class AlgoliaSearchRecordExtractor
  # Options used when the site configuration does not override them
  DEFAULT_CONFIG = { 'record_css_selector' => 'p' }.freeze

  # Tag names considered as headings when looking for the parents of a node
  HEADING_TAGS = %w(h1 h2 h3 h4 h5 h6).freeze

  # Record keys forming the hierarchy, from the broadest to the most specific
  HIERARCHY_KEYS = %i(title h1 h2 h3 h4 h5 h6).freeze

  # file - the Jekyll file to extract records from. Its
  #        `site.config['algolia']` hash, when present, is merged over
  #        DEFAULT_CONFIG.
  def initialize(file)
    @file = file
    # The `algolia` key may be missing from the site configuration, and
    # merging nil would raise a TypeError; fall back to the defaults only.
    @config = DEFAULT_CONFIG.merge(file.site.config['algolia'] || {})
  end

  # Hook to modify a record after extracting. Returns the record as is.
  def custom_hook_each(item)
    item
  end

  # Hook to modify all records after extracting. Returns the list as is.
  def custom_hook_all(items)
    items
  end

  # Returns metadata from the current file, depending on its type. Files that
  # are neither a Jekyll::Page nor a Jekyll::Post yield an empty hash.
  def metadata
    return metadata_page if @file.is_a?(Jekyll::Page)
    return metadata_post if @file.is_a?(Jekyll::Post)
    {}
  end

  # Extract a list of tags, or nil if the file does not expose any
  def tags
    return nil unless @file.respond_to? :tags
    # Some plugins will extend the tags from simple strings to full featured
    # objects. We'll simply call .to_s to always have a string
    @file.tags.map(&:to_s)
  end

  # Extract metadata from a post. The date is stored as an integer timestamp.
  def metadata_post
    {
      type: 'post',
      url: @file.url,
      title: @file.title,
      slug: @file.slug,
      posted_at: @file.date.to_time.to_i,
      tags: tags
    }
  end

  # Extract metadata from a page. The title is whatever `@file['title']`
  # returns, which can be nil.
  def metadata_page
    {
      type: 'page',
      url: @file.url,
      title: @file['title'],
      slug: @file.basename
    }
  end

  # Get the list of all HTML nodes to index, using the configured selector
  def html_nodes
    document = Nokogiri::HTML(@file.content)
    document.css(@config['record_css_selector'])
  end

  # Get the closest heading parent: the first heading found when walking
  # backward through the previous siblings of the node, then through the
  # previous siblings of its ancestors.
  #
  # Returns the heading node, or nil if none is found before reaching `body`.
  def node_heading_parent(node)
    current = node
    # Walk with a loop rather than recursion, so that long runs of siblings
    # cannot exhaust the call stack.
    loop do
      previous = current.previous_element

      if previous
        # This is a heading, we return it
        return previous if HEADING_TAGS.include?(previous.name)
        current = previous
      else
        # No previous element, we go up to the parent
        parent = current.parent
        # No more parent (or we reached the body), then no heading found
        return nil if parent.nil? || parent.name == 'body'
        current = parent
      end
    end
  end

  # Get all the parent headings of the specified node.
  #
  # node - the node to start from
  # memo - hash the headings are added to. Its optional :level key is the
  #        heading level below which headings are accepted (defaults to 7, so
  #        any heading is accepted first).
  #
  # Returns memo, without its :level key, holding the content of each heading
  # under its tag name as a symbol (e.g. { h1: 'Title', h2: 'Section' }).
  def node_hierarchy(node, memo = { level: 7 })
    lowest_level = memo.fetch(:level, 7)
    heading = node_heading_parent(node)

    # Once an h1 is recorded, no other heading can rank above it
    while heading && lowest_level > 1
      level = heading.name.delete('h').to_i

      # Only keep headings ranking above those already found: a previous
      # heading of the same or a deeper level belongs to another section.
      if level < lowest_level
        lowest_level = level
        memo[heading.name.to_sym] = heading.content
      end

      heading = node_heading_parent(heading)
    end

    # :level is only a bookkeeping key and must not end up in the record
    memo.delete(:level)
    memo
  end

  # Return the raw HTML of the element to index
  def node_raw_html(node)
    node.to_s
  end

  # Return the text of the element, with `<` and `>` replaced by their HTML
  # entities. No other character is escaped.
  def node_text(node)
    node.content.gsub('<', '&lt;').gsub('>', '&gt;')
  end

  # Returns a unique string of hierarchy from title to h6, used for distinct.
  # Missing levels are skipped and the others are joined with ' > '.
  def unique_hierarchy(data)
    HIERARCHY_KEYS.map { |key| data[key] }.compact.join(' > ')
  end

  # Returns a CSS selector (as a string) targeting the specified node, or nil
  # if no node is given.
  def node_css_selector(node)
    return nil if node.nil?

    # Use the CSS id if one is set
    return "##{node['id']}" if node['id']

    # Default Nokogiri selector, made relative to the body
    node.css_path.gsub('html > body > ', '')
  end

  # Returns a custom numeric value representing how relevant to its hierarchy
  # this record is. This value can be used in the custom ranking to display more
  # relevant records first.
  #
  # The value is the number of distinct words shared between the headings
  # (title to h6) and the text of the record, compared case-insensitively.
  def weight(data)
    # Get list of unique words in headings. Headings can be missing or nil
    # (e.g. a page without title), hence the to_s. Scanning for words, instead
    # of splitting on non-words, never yields empty strings.
    heading_words = HIERARCHY_KEYS
                    .flat_map { |key| data[key].to_s.downcase.scan(/\w+/) }
                    .uniq
    # Intersect words in headings with words in text
    text_words = data[:text].to_s.downcase.scan(/\w+/)
    (heading_words & text_words).size
  end

  # Builds the records of the file: one per non-empty node returned by
  # `html_nodes`, passed through `custom_hook_each`. The whole list is then
  # passed through `custom_hook_all`, whose result is returned.
  def extract
    items = []
    html_nodes.each_with_index do |node, index|
      # Nodes without any text are not indexed
      next if node.text.empty?

      item = metadata.clone
      # The position of the node among the matched nodes tells records of the
      # same file apart
      item[:objectID] = "#{item[:slug]}_#{index}"
      item.merge!(node_hierarchy(node))
      item[:tag_name] = node.name
      item[:raw_html] = node_raw_html(node)
      item[:text] = node_text(node)
      item[:unique_hierarchy] = unique_hierarchy(item)
      item[:css_selector] = node_css_selector(node)
      item[:css_selector_parent] = node_css_selector(node_heading_parent(node))
      item[:weight] = weight(item)

      items << custom_hook_each(item)
    end
    custom_hook_all(items)
  end
end