Was previously only indexing headings in the same parent. Will now go up the DOM tree until reaching the body tag.
263 lines
8.1 KiB
Ruby
263 lines
8.1 KiB
Ruby
require 'algoliasearch'
|
|
require 'nokogiri'
|
|
require 'json'
|
|
|
|
# `jekyll algolia push` command
|
|
class AlgoliaSearchJekyllPush < Jekyll::Command
|
|
class << self
|
|
# Accepts the program object and deliberately does nothing with it.
# NOTE(review): presumably the hook Jekyll calls to let commands register
# themselves - confirm against Jekyll::Command.
def init_with_program(_prog); end
|
|
|
|
# Runs the push: builds a Jekyll site from the configuration and processes
# it, with the "write to disk" step swapped for a push to the index.
#
# args    - command arguments; the first one, when present, replaces the
#           index name found in the configuration
# options - command options, kept as-is in @options
# config  - configuration hash handed to Jekyll::Site.new
def process(args = [], options = {}, config = {})
  @args = args
  @options = options
  @config = config

  # An index name given on the command line wins over the configured one
  requested_index = args[0]
  @config['algolia']['index_name'] = requested_index if requested_index

  site = Jekyll::Site.new(@config)

  # Rather than writing the generated website to disk, collect the items of
  # every file and push them to the index
  def site.write
    items = []
    each_site_file do |file|
      file_items = AlgoliaSearchJekyllPush.get_items_from_file(file)
      # Files that are not indexable yield nil and contribute nothing
      items += file_items unless file_items.nil?
    end
    AlgoliaSearchJekyllPush.push(items)
  end

  site.process
end
|
|
|
|
# Tells whether a page can be indexed.
#
# A page is parseable when its extension (without the dot) is one of the
# comma-separated extensions of @config['markdown_ext'], or when it is an
# html page for which file['title'] is set. Always returns true or false.
def parseable?(file)
  extension = file.ext.delete('.')
  markdown_extensions = @config['markdown_ext'].split(',')

  # Markdown pages are accepted unconditionally
  return true if markdown_extensions.include?(extension)

  # Html pages are accepted only when they define a title
  html_with_title = extension == 'html' && file['title']
  html_with_title ? true : false
end
|
|
|
|
# Tells whether the file name is listed in
# @config['algolia']['excluded_files'].
#
# A configuration without an `excluded_files` entry is treated as an empty
# list, so nothing is excluded instead of raising on nil.
def excluded_file?(file)
  excluded_files = @config['algolia']['excluded_files'] || []
  excluded_files.include?(file.name)
end
|
|
|
|
# Makes sure the API key, the application ID and the index name are all
# defined. Returns true when they are. For the first missing one, logs how
# to define it and exits with status 1.
def check_credentials(api_key, application_id, index_name)
  report_missing_api_key unless api_key
  report_missing_application_id unless application_id
  report_missing_index_name unless index_name
  true
end

# Logs the two ways of defining the API key, then exits with status 1
def report_missing_api_key
  Jekyll.logger.error 'Algolia Error: No API key defined'
  Jekyll.logger.warn ' You have two ways to configure your API key:'
  Jekyll.logger.warn ' - The ALGOLIA_API_KEY environment variable'
  Jekyll.logger.warn ' - A file named ./_algolia_api_key in your '\
                     'source folder'
  exit 1
end

# Logs how to define the application ID, then exits with status 1
def report_missing_application_id
  Jekyll.logger.error 'Algolia Error: No application ID defined'
  Jekyll.logger.warn ' Please set your application id in the '\
                     '_config.yml file, like so:'
  puts ''
  # The spaces are needed otherwise the text is centered
  Jekyll.logger.warn ' algolia: '
  Jekyll.logger.warn ' application_id: \'{your_application_id}\''
  puts ''
  Jekyll.logger.warn ' Your application ID can be found in your algolia'\
                     ' dashboard'
  Jekyll.logger.warn ' https://www.algolia.com/licensing'
  exit 1
end

# Logs how to define the index name, then exits with status 1
def report_missing_index_name
  Jekyll.logger.error 'Algolia Error: No index name defined'
  Jekyll.logger.warn ' Please set your index name in the _config.yml'\
                     ' file, like so:'
  puts ''
  # The spaces are needed otherwise the text is centered
  Jekyll.logger.warn ' algolia: '
  Jekyll.logger.warn ' index_name: \'{your_index_name}\''
  puts ''
  Jekyll.logger.warn ' You can edit your indices in your dashboard'
  Jekyll.logger.warn ' https://www.algolia.com/explorer'
  exit 1
end
|
|
|
|
# Applies the settings to the index.
#
# The default settings below are overlaid with the entries of
# @config['algolia']['settings'], whose keys are converted to symbols so
# that they replace the matching defaults. A configuration without a
# `settings` entry simply gets the defaults.
#
# index - object receiving the merged hash through `set_settings`
#
# Returns whatever `index.set_settings` returns.
def configure_index(index)
  default_settings = {
    typoTolerance: true,
    attributeForDistinct: 'parent_id',
    attributesForFaceting: %w(tags type),
    attributesToIndex: %w(
      title h1 h2 h3 h4 h5 h6
      unordered(text)
      unordered(tags)
    ),
    attributesToRetrieve: %w(
      title h1 h2 h3 h4 h5 h6
      posted_at
      content
      text
      url
      css_selector
    ),
    customRanking: ['desc(posted_at)', 'desc(title_weight)'],
    distinct: true,
    highlightPreTag: '<span class="algolia__result-highlight">',
    highlightPostTag: '</span>'
  }

  # The settings entry is optional: fall back to no customisation at all
  configured = @config['algolia']['settings'] || {}
  custom_settings = configured.each_with_object({}) do |(key, value), memo|
    memo[key.to_sym] = value
  end

  index.set_settings(default_settings.merge(custom_settings))
end
|
|
|
|
# Converts a site file into the list of items to index.
#
# Returns nil when the file is neither a Jekyll::Page nor a Jekyll::Post,
# when it is a page that is not parseable, or when it is excluded.
# Otherwise returns the result of get_paragraphs_from_html for the file
# content, every item sharing the data common to the file.
def get_items_from_file(file)
  page = file.is_a?(Jekyll::Page)
  post = file.is_a?(Jekyll::Post)

  # We only index posts, and markdown pages
  return nil unless page || post
  return nil if page && !parseable?(file)
  return nil if excluded_file?(file)

  # Newlines are turned into spaces before the content is parsed
  flattened_html = file.content.tr("\n", ' ')

  base_data =
    if post
      {
        type: 'post',
        parent_id: file.id,
        url: file.url,
        title: file.title,
        tags: get_tags_from_post(file),
        slug: file.slug,
        posted_at: file.date.to_time.to_i
      }
    else
      {
        type: 'page',
        parent_id: file.basename,
        url: file.url,
        title: file['title'],
        slug: file.basename
      }
    end

  get_paragraphs_from_html(flattened_html, base_data)
end
|
|
|
|
# Get the list of tags of a post, as strings. Handles both classic string
# tags and extended object tags: every tag is converted with `to_s` and
# stripped of its commas.
#
# Returns an empty array when the post has no tags. The tags of the post
# itself are left untouched; a new array is returned.
def get_tags_from_post(post)
  tags = post.tags
  return [] if tags.nil?

  # Array() also accepts a lone tag that is not wrapped in an array
  Array(tags).map { |tag| tag.to_s.delete(',') }
end
|
|
|
|
# Get the list of headings (h1, h2, etc) above the specified node
|
|
def get_previous_hx(node, memo = { level: 7 })
|
|
previous = node.previous_element
|
|
# No previous element, we go up to the parent
|
|
unless previous
|
|
parent = node.parent
|
|
# No parent, we stop
|
|
if parent.name == 'body'
|
|
memo.delete(:level)
|
|
return memo
|
|
end
|
|
# We start from the previous sibling of the parent
|
|
return get_previous_hx(parent, memo)
|
|
end
|
|
|
|
# Skip non-title elements
|
|
tag_name = previous.name
|
|
possible_title_elements = %w(h1 h2 h3 h4 h5 h6)
|
|
unless possible_title_elements.include?(tag_name)
|
|
return get_previous_hx(previous, memo)
|
|
end
|
|
|
|
# Skip if item already as title of a higher level
|
|
title_level = tag_name.gsub('h', '').to_i
|
|
return get_previous_hx(previous, memo) if title_level >= memo[:level]
|
|
memo[:level] = title_level
|
|
|
|
# Add to the memo and continue
|
|
memo[tag_name.to_sym] = previous.content
|
|
get_previous_hx(previous, memo)
|
|
end
|
|
|
|
# Get a custom value representing the number of distinct words of the
# titles (title, h1 to h6) that occur in the content.
#
# content - text the title words are searched in (substring match, case
#           sensitive)
# item    - hash that may hold :title and :h1 to :h6 entries; entries that
#           are missing or nil contribute no word
#
# Returns the number of matching words as an Integer.
def get_title_weight(content, item)
  words = %i(title h1 h2 h3 h4 h5 h6)
          .select { |key| item.key?(key) }
          # to_s because a title can be nil
          .flat_map { |key| item[key].to_s.split(/\W+/) }
          # A title starting with a non-word character yields an empty
          # string, which every content includes: it must not be counted
          .reject(&:empty?)
          .uniq

  # Count how many words are in the text
  words.count { |word| content.include?(word) }
end
|
|
|
|
# Get a css selector for the node: its css path with every
# 'html > body > ' occurrence removed
def get_css_selector(node)
  full_path = node.css_path
  full_path.gsub('html > body > ', '')
end
|
|
|
|
# Get a list of items representing the different paragraphs of the html.
#
# Every non-empty `p` element becomes one item: a copy of base_data
# completed with the headings above the paragraph, an objectID, a css
# selector, the raw html, the text and the title weight.
def get_paragraphs_from_html(html, base_data)
  document = Nokogiri::HTML(html)

  items = document.css('p').each_with_index.map do |paragraph, position|
    # Empty paragraphs give no item, but still use up their position
    next if paragraph.text.empty?

    item = base_data.clone
    item.merge!(get_previous_hx(paragraph))
    item[:objectID] = "#{item[:parent_id]}_#{position}"
    item[:css_selector] = get_css_selector(paragraph)
    item[:raw_html] = paragraph.to_s
    item[:text] = paragraph.content
    item[:title_weight] = get_title_weight(paragraph.text, item)
    item
  end

  # Drop the nil left by each skipped paragraph
  items.compact
end
|
|
|
|
# Pushes the items to the index.
#
# Checks the credentials, applies the index settings and clears the index,
# then adds the items in batches of 1000. An error raised while adding a
# batch is logged and the process exits with status 1.
def push(items)
  api_key = AlgoliaSearchJekyll.api_key
  algolia_config = @config['algolia']
  application_id = algolia_config['application_id']
  index_name = algolia_config['index_name']
  check_credentials(api_key, application_id, index_name)

  Algolia.init(application_id: application_id, api_key: api_key)
  index = Algolia::Index.new(index_name)
  configure_index(index)
  # The index is emptied before the new items are added
  index.clear_index

  items.each_slice(1000) do |batch|
    Jekyll.logger.info "Indexing #{batch.size} items"
    begin
      index.add_objects(batch)
    rescue StandardError => e
      Jekyll.logger.error 'Algolia Error: HTTP Error'
      Jekyll.logger.warn e.message
      exit 1
    end
  end

  Jekyll.logger.info "Indexing of #{items.size} items in #{index_name} done."
end
|
|
end
|
|
end
|