jekyll-algolia/lib/record_extractor.rb
2016-06-28 15:42:14 +02:00

167 lines
3.7 KiB
Ruby

require 'algoliasearch'
require 'nokogiri'
require 'json'
require 'html-hierarchy-extractor'
require_relative './utils'
# Given an HTML file as input, will return an array of records to index.
#
# The file only needs to expose the methods used below (`site`, `data`,
# `path`, `url`, `content`, and optionally `slug`, `tags`, `date` and
# `collection`).
class AlgoliaSearchRecordExtractor
  # Options used when the site configuration does not override them
  DEFAULT_CONFIG = {
    'record_css_selector' => 'p'
  }.freeze

  # Front-matter keys that are exposed through dedicated methods and are
  # therefore excluded from #front_matter
  HANDLED_ATTRIBUTES = %w(title tags slug url date type).freeze

  attr_reader :file

  ##
  # Keep a reference to the file and build the configuration by merging the
  # `algolia` section of the site configuration over the defaults.
  # A site without any `algolia` section simply gets the defaults.
  def initialize(file)
    @file = file
    user_config = file.site.config['algolia'] || {}
    @config = DEFAULT_CONFIG.merge(user_config)
  end

  ##
  # Hook to modify a record after extracting.
  # Receives the record and the node it was extracted from. Returning nil
  # removes the record from the final list (see #extract).
  def custom_hook_each(item, _node)
    item
  end

  ##
  # Hook to modify all records after extracting.
  # Receives the full list of records; its return value is what #extract
  # returns.
  def custom_hook_all(items)
    items
  end

  ##
  # Return the type of the Jekyll element
  # It can be either page, post or document
  def type
    # The second segment of the class name (e.g. Jekyll::Page) gives the type
    classname = @file.class.name
    element_type = classname.split('::')[1].downcase

    # In Jekyll v2, Page, Post and Document have their own class
    if AlgoliaSearchUtils.restrict_jekyll_version(less_than: '3.0')
      return element_type
    end

    # In Jekyll v3, Post are actually a specific type of Documents
    if element_type == 'document'
      collection_name = @file.collection.instance_variable_get('@label')
      return 'post' if collection_name == 'posts'
    end

    element_type
  end

  ##
  # Return the url of the page
  def url
    @file.url
  end

  ##
  # Return the title of the page, as found in its front-matter
  def title
    @file.data['title']
  end

  ##
  # Returns the slug of the document
  def slug
    # Jekyll v2 posts have a specific slug method
    return @file.slug if @file.respond_to?(:slug)
    # Jekyll v3 posts have it in data
    return @file.data['slug'] if @file.data.key?('slug')

    # Otherwise we guess the slug from the filename, without its extension
    basename = File.basename(@file.path)
    File.basename(basename, File.extname(basename))
  end

  ##
  # Get an array of tags of the document, always as an array of strings.
  # A missing, nil or single-valued `tags` entry is accepted.
  def tags
    # Jekyll v2 posts have a specific tags methods
    tags = @file.respond_to?(:tags) ? Array(@file.tags) : []
    # Others have it in data
    tags = Array(@file.data['tags']) if tags.empty?
    # Some extension extends the tags with custom classes, so we make sure we
    # cast them as strings
    tags.map(&:to_s)
  end

  ##
  # Get the post date timestamp (integer), or nil when the file has no date
  # method
  def date
    return nil unless @file.respond_to?(:date)
    @file.date.to_time.to_i
  end

  ##
  # Get a hash of all front-matter data, with keys converted to symbols.
  # Keys handled by specific methods (see HANDLED_ATTRIBUTES) are left out.
  # A new hash is built so the data of the file itself is never modified.
  def front_matter
    @file.data.each_with_object({}) do |(key, value), data|
      next if HANDLED_ATTRIBUTES.include?(key)
      data[key.to_sym] = value
    end
  end

  ##
  # Get the list of all node data, by running the hierarchy extractor on the
  # file content with the configured CSS selector
  def hierarchy_nodes
    extractor_options = {
      css_selector: @config['record_css_selector']
    }

    HTMLHierarchyExtractor.new(
      @file.content,
      options: extractor_options
    ).extract
  end

  ##
  # Extract all records from the page and return the list
  def extract
    # Getting all hierarchical nodes from the HTML input
    raw_items = hierarchy_nodes

    # Shared attributes relative to the page that all records will have
    shared_attributes = {
      type: type,
      url: url,
      title: title,
      slug: slug,
      date: date,
      tags: tags
    }

    # Enriching with page metadata. The node is taken out of the record and
    # only handed to the hook.
    items = raw_items.map do |raw_item|
      node = raw_item.delete(:node)
      custom_hook_each(shared_attributes.merge(raw_item), node)
    end

    # Records for which the hook returned nil are discarded
    custom_hook_all(items.compact)
  end
end