# jekyll-algolia/lib/record_extractor.rb

require 'algoliasearch'
require 'nokogiri'
require 'json'
require 'html-hierarchy-extractor'
require_relative './utils'

# Given an HTML file as input, will return an array of records to index
class AlgoliaSearchRecordExtractor
  attr_reader :file

  def initialize(file)
    @file = file
    @config = file.site.config
    default_config = {
      'record_css_selector' => 'p'
    }
    @config = default_config.merge(file.site.config['algolia'])
  end

  # Hook to modify a record after extracting
  def custom_hook_each(item, _node)
    item
  end

  # Hook to modify all records after extracting
  def custom_hook_all(items)
    items
  end

  ##
  # Return the type of the Jekyll element
  # It can be either page, post or document
  def type
    classname = @file.class.name
    subclass = classname.split('::')[1]
    type = subclass.downcase

    # In Jekyll v2, Page, Post and Document have their own class
    return type if AlgoliaSearchUtils.restrict_jekyll_version(less_than: '3.0')

    # In Jekyll v3, Post are actually a specific type of Documents
    if type == 'document'
      collection_name = @file.collection.instance_variable_get('@label')
      return 'post' if collection_name == 'posts'
    end

    type
  end

  ##
  # Return the url of the page
  def url
    @file.url
  end

  ##
  # Return the title of the page
  def title
    @file.data['title']
  end

  ##
  # Returns the slug of the document
  def slug
    # We can guess the slug from the filename for all documents
    basename = File.basename(@file.path)
    extname = File.extname(basename)
    slug = File.basename(basename, extname)

    # Jekyll v2 posts have a specific slug method
    return @file.slug if @file.respond_to? :slug

    # Jekyll v3 posts have it in data
    return @file.data['slug'] if @file.data.key?('slug')

    slug
  end

  ##
  # Get an array of tags of the document
  def tags
    tags = []

    # Jekyll v2 posts have a specific tags methods
    tags = @file.tags if @file.respond_to?(:tags)

    # Others have it in data
    tags = @file.data['tags'] if tags.empty? && @file.data.key?('tags')

    # Some extension extends the tags with custom classes, so we make sure we
    # cast them as strings
    tags.map(&:to_s)
  end

  ##
  # Get the post date timestamp
  def date
    return nil unless @file.respond_to?(:date)

    @file.date.to_time.to_i
  end

  ##
  # Get a hash of all front-matter data
  def front_matter
    raw_data = @file.data

    # We clean some keys that will be handled by specific methods
    attributes_to_remove = %w(title tags slug url date type)
    attributes_to_remove.each do |attribute|
      raw_data.delete(attribute)
    end

    # Convert to symbols
    data = {}
    raw_data.each do |key, value|
      data[key.to_sym] = value
    end

    data
  end

  ##
  # Get the list of all node data
  def hierarchy_nodes
    extractor_options = {
      css_selector: @config['record_css_selector']
    }

    HTMLHierarchyExtractor.new(
      @file.content,
      options: extractor_options
    ).extract
  end

  # Extract all records from the page and return the list
  def extract
    # Getting all hierarchical nodes from the HTML input
    raw_items = hierarchy_nodes

    # Shared attributes relative to the page that all records will have
    shared_attributes = {
      type: type,
      url: url,
      title: title,
      slug: slug,
      date: date,
      tags: tags
    }

    # Enriching with page metadata
    items = []
    raw_items.each do |raw_item|
      nokogiri_node = raw_item[:node]
      raw_item.delete(:node)
      item = shared_attributes.merge(raw_item)

      item = custom_hook_each(item, nokogiri_node)
      next if item.nil?

      items << item
    end

    custom_hook_all(items)
  end
end