167 lines
3.7 KiB
Ruby
167 lines
3.7 KiB
Ruby
require 'algoliasearch'
|
|
require 'nokogiri'
|
|
require 'json'
|
|
require 'html-hierarchy-extractor'
|
|
require_relative './utils'
|
|
|
|
# Given an HTML file as input, will return an array of records to index
|
|
class AlgoliaSearchRecordExtractor
|
|
attr_reader :file
|
|
|
|
def initialize(file)
|
|
@file = file
|
|
@config = file.site.config
|
|
default_config = {
|
|
'record_css_selector' => 'p'
|
|
}
|
|
@config = default_config.merge(file.site.config['algolia'])
|
|
end
|
|
|
|
# Hook to modify a record after extracting
|
|
def custom_hook_each(item, _node)
|
|
item
|
|
end
|
|
|
|
# Hook to modify all records after extracting
|
|
def custom_hook_all(items)
|
|
items
|
|
end
|
|
|
|
##
|
|
# Return the type of the Jekyll element
|
|
# It can be either page, post or document
|
|
def type
|
|
classname = @file.class.name
|
|
subclass = classname.split('::')[1]
|
|
type = subclass.downcase
|
|
|
|
# In Jekyll v2, Page, Post and Document have their own class
|
|
return type if AlgoliaSearchUtils.restrict_jekyll_version(less_than: '3.0')
|
|
|
|
# In Jekyll v3, Post are actually a specific type of Documents
|
|
if type == 'document'
|
|
collection_name = @file.collection.instance_variable_get('@label')
|
|
return 'post' if collection_name == 'posts'
|
|
end
|
|
|
|
type
|
|
end
|
|
|
|
##
|
|
# Return the url of the page
|
|
def url
|
|
@file.url
|
|
end
|
|
|
|
##
|
|
# Return the title of the page
|
|
def title
|
|
@file.data['title']
|
|
end
|
|
|
|
##
|
|
# Returns the slug of the document
|
|
def slug
|
|
# We can guess the slug from the filename for all documents
|
|
basename = File.basename(@file.path)
|
|
extname = File.extname(basename)
|
|
slug = File.basename(basename, extname)
|
|
|
|
# Jekyll v2 posts have a specific slug method
|
|
return @file.slug if @file.respond_to? :slug
|
|
|
|
# Jekyll v3 posts have it in data
|
|
return @file.data['slug'] if @file.data.key?('slug')
|
|
|
|
slug
|
|
end
|
|
|
|
##
|
|
# Get an array of tags of the document
|
|
def tags
|
|
tags = []
|
|
|
|
# Jekyll v2 posts have a specific tags methods
|
|
tags = @file.tags if @file.respond_to?(:tags)
|
|
|
|
# Others have it in data
|
|
tags = @file.data['tags'] if tags.empty? && @file.data.key?('tags')
|
|
|
|
# Some extension extends the tags with custom classes, so we make sure we
|
|
# cast them as strings
|
|
tags.map(&:to_s)
|
|
end
|
|
|
|
##
|
|
# Get the post date timestamp
|
|
def date
|
|
return nil unless @file.respond_to?(:date)
|
|
|
|
@file.date.to_time.to_i
|
|
end
|
|
|
|
##
|
|
# Get a hash of all front-matter data
|
|
def front_matter
|
|
raw_data = @file.data
|
|
|
|
# We clean some keys that will be handled by specific methods
|
|
attributes_to_remove = %w(title tags slug url date type)
|
|
attributes_to_remove.each do |attribute|
|
|
raw_data.delete(attribute)
|
|
end
|
|
|
|
# Convert to symbols
|
|
data = {}
|
|
raw_data.each do |key, value|
|
|
data[key.to_sym] = value
|
|
end
|
|
|
|
data
|
|
end
|
|
|
|
##
|
|
# Get the list of all node data
|
|
def hierarchy_nodes
|
|
extractor_options = {
|
|
css_selector: @config['record_css_selector']
|
|
}
|
|
|
|
HTMLHierarchyExtractor.new(
|
|
@file.content,
|
|
options: extractor_options
|
|
).extract
|
|
end
|
|
|
|
# Extract all records from the page and return the list
|
|
def extract
|
|
# Getting all hierarchical nodes from the HTML input
|
|
raw_items = hierarchy_nodes
|
|
|
|
# Shared attributes relative to the page that all records will have
|
|
shared_attributes = {
|
|
type: type,
|
|
url: url,
|
|
title: title,
|
|
slug: slug,
|
|
date: date,
|
|
tags: tags
|
|
}
|
|
|
|
# Enriching with page metadata
|
|
items = []
|
|
raw_items.each do |raw_item|
|
|
nokogiri_node = raw_item[:node]
|
|
raw_item.delete(:node)
|
|
item = shared_attributes.merge(raw_item)
|
|
|
|
item = custom_hook_each(item, nokogiri_node)
|
|
next if item.nil?
|
|
|
|
items << item
|
|
end
|
|
|
|
custom_hook_all(items)
|
|
end
|
|
end
|