require 'algoliasearch'
require 'nokogiri'
require 'json'

# Given an HTML file as input, returns an array of records to index
class AlgoliaSearchRecordExtractor
  def initialize(file)
    @file = file
    default_config = {
      'record_css_selector' => 'p'
    }
    @config = default_config.merge(file.site.config['algolia'])
  end
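
  # Note: `file.site.config['algolia']` is whatever sits under the `algolia:`
  # key of the site's `_config.yml`. As a minimal sketch (the selector value
  # is only an illustration), if that hash is
  # { 'record_css_selector' => 'p,li' }, the merge overrides the default and
  # `@config['record_css_selector']` becomes 'p,li'.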

  # Hook to modify a record after extracting
  def custom_hook_each(item)
    item
  end

  # Hook to modify all records after extracting
  def custom_hook_all(items)
    items
  end

  # Returns metadata from the current file
  def metadata
    return metadata_page if @file.is_a?(Jekyll::Page)
    return metadata_post if @file.is_a?(Jekyll::Post)
    {}
  end

  # Extract a list of tags
  def tags
    return nil unless @file.respond_to? :tags
    # Some plugins will extend the tags from simple strings to full featured
    # objects. We'll simply call .to_s to always have a string
    @file.tags.map(&:to_s)
  end

  # Extract metadata from a post
  def metadata_post
    {
      type: 'post',
      url: @file.url,
      title: @file.title,
      slug: @file.slug,
      posted_at: @file.date.to_time.to_i,
      tags: tags
    }
  end
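
  # As an illustration (all values hypothetical), a post published on
  # 2015-07-01 with the slug 'getting-started' would produce something like:
  #
  #   {
  #     type: 'post',
  #     url: '/2015/07/01/getting-started.html',
  #     title: 'Getting started',
  #     slug: 'getting-started',
  #     posted_at: 1435708800,
  #     tags: ['jekyll', 'algolia']
  #   }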

  # Extract metadata from a page
  def metadata_page
    {
      type: 'page',
      url: @file.url,
      title: @file['title'],
      slug: @file.basename
    }
  end

  # Get the list of all HTML nodes to index
  def html_nodes
    document = Nokogiri::HTML(@file.content)
    document.css(@config['record_css_selector'])
  end

  # Get the closest heading parent
  def node_heading_parent(node)
    previous = node.previous_element

    # No previous element, we go up to the parent
    unless previous
      parent = node.parent
      # No more parent, then no heading found
      return nil if parent.name == 'body'
      return node_heading_parent(parent)
    end

    # This is a heading, we return it
    return previous if %w(h1 h2 h3 h4 h5 h6).include?(previous.name)

    node_heading_parent(previous)
  end
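
  # For example (a sketch, not actual fixtures), given this HTML:
  #
  #   <h2>Install</h2>
  #   <p>First paragraph</p>
  #   <p>Second paragraph</p>
  #
  # calling node_heading_parent on the second <p> walks back over the first
  # <p> and returns the <h2> node; on a node with no heading anywhere before
  # it, the method returns nil.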

  # Get all the parent headings of the specified node
  def node_hierarchy(node, memo = { level: 7 })
    previous = node_heading_parent(node)

    # No previous heading, we can stop the recursion
    unless previous
      memo.delete(:level)
      return memo
    end

    tag_name = previous.name
    level = tag_name.gsub('h', '').to_i
    content = previous.content

    # Skip this heading if we already have one of an equal or higher level
    return node_hierarchy(previous, memo) if level >= memo[:level]
    memo[:level] = level

    # Add to the memo and continue
    memo[tag_name.to_sym] = content
    node_hierarchy(previous, memo)
  end
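
  # A sketch of the result (hypothetical content): for a <p> sitting under
  #
  #   <h1>Getting started</h1>
  #   <h2>Install</h2>
  #
  # node_hierarchy returns { h2: 'Install', h1: 'Getting started' }. The
  # :level key only keeps lower-level headings from overwriting higher ones
  # during the recursion and is removed before the hash is returned.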

  # Return the raw HTML of the element to index
  def node_raw_html(node)
    node.to_s
  end

  # Return the text of the element, sanitized to be displayed
  def node_text(node)
    node.content.gsub('<', '&lt;').gsub('>', '&gt;')
  end

  # Returns a unique string of hierarchy from title to h6, used for distinct
  def unique_hierarchy(data)
    headings = %w(title h1 h2 h3 h4 h5 h6)
    headings.map { |heading| data[heading.to_sym] }.compact.join(' > ')
  end
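
  # For example (hypothetical values), { title: 'Guide', h1: 'Install',
  # h2: 'Linux' } becomes 'Guide > Install > Linux', which can then be used
  # as the attribute for Algolia's distinct feature.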

  # Returns a CSS selector targeting the node: its id when one is set,
  # otherwise the Nokogiri path relative to the body
  def node_css_selector(node)
    return nil if node.nil?

    # Use the CSS id if one is set
    return "##{node['id']}" if node['id']

    # Default Nokogiri selector
    node.css_path.gsub('html > body > ', '')
  end
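
  # For example (hypothetical markup), a node with id="setup" yields '#setup',
  # while the third <p> directly under <body> yields something like
  # 'p:nth-of-type(3)' once the 'html > body > ' prefix is stripped.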

  # Returns a custom numeric value representing how relevant to its hierarchy
  # this record is. This value can be used in the custom ranking to display
  # more relevant records first.
  def weight(data)
    # Get the list of unique words in the headings
    title_words = %i(title h1 h2 h3 h4 h5 h6)
                  .select { |title| data.key?(title) }
                  .map { |title| data[title].split(/\W+/) }
                  .flatten
                  .compact
                  .map(&:downcase)
                  .uniq
    # Intersect words in headings with words in the text
    text_words = data[:text].downcase.split(/\W+/)
    (title_words & text_words).size
  end
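
  # Worked example (hypothetical record): with headings containing the unique
  # words ['install', 'guide'] and a text of 'How to install the gem', only
  # 'install' appears in both, so the weight is 1.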

  # Loop over all indexable HTML nodes and build one record per node,
  # enriched with the file metadata, the heading hierarchy and custom hooks
  def extract
    items = []
    html_nodes.each_with_index do |node, index|
      next if node.text.empty?

      item = metadata.clone
      item[:objectID] = "#{item[:slug]}_#{index}"
      item.merge!(node_hierarchy(node))
      item[:tag_name] = node.name
      item[:raw_html] = node_raw_html(node)
      item[:text] = node_text(node)
      item[:unique_hierarchy] = unique_hierarchy(item)
      item[:css_selector] = node_css_selector(node)
      item[:css_selector_parent] = node_css_selector(node_heading_parent(node))
      item[:weight] = weight(item)

      item = custom_hook_each(item)

      items << item
    end
    custom_hook_all(items)
  end
end
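
# Example usage, as a minimal sketch: `page` is assumed to be a Jekyll::Page
# (or Jekyll::Post) that has already been rendered to HTML and whose site
# config contains an `algolia` entry.
#
#   extractor = AlgoliaSearchRecordExtractor.new(page)
#   records = extractor.extract
#   records.first[:objectID] # => e.g. "about_0"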