225 lines
7.0 KiB
Ruby
225 lines
7.0 KiB
Ruby
require 'algoliasearch'
|
|
require 'nokogiri'
|
|
require 'json'
|
|
|
|
# `jekyll algolia push` command
|
|
class AlgoliaSearchJekyllPush < Jekyll::Command
|
|
class << self
|
|
# Command registration hook; intentionally does nothing here.
#
# _prog - the program object handed over by the caller (unused).
#
# Returns nil.
def init_with_program(_prog); end
|
|
|
|
# Entry point of the `jekyll algolia push` command.
#
# Stores the arguments, options and configuration on the receiver, lets an
# optional first argument override the `algolia.index_name` setting, then
# processes a Jekyll::Site whose `write` step pushes records to the index
# instead of writing the generated website to disk.
#
# args    - Array of command-line arguments; args[0], when present, is used
#           as the index name.
# options - Hash of command options (stored, not otherwise read here).
# config  - Hash of site configuration handed to Jekyll::Site.new.
#
# Returns whatever `site.process` returns.
def process(args = [], options = {}, config = {})
  @args = args
  @options = options
  @config = config

  index_name = args.first
  if index_name
    # The `algolia` section may be absent from the configuration; create it
    # instead of failing on nil when only the index name is supplied.
    @config['algolia'] ||= {}
    @config['algolia']['index_name'] = index_name
  end

  site = Jekyll::Site.new(@config)

  # Instead of writing the generated website to disk, we push it to the index.
  def site.write
    items = []
    each_site_file do |file|
      new_items = AlgoliaSearchJekyllPush.get_items_from_file(file)
      # Files that are not indexable yield nil and are skipped.
      items.concat(new_items) unless new_items.nil?
    end
    AlgoliaSearchJekyllPush.push(items)
  end

  site.process
end
|
|
|
|
# Tells whether a file name carries one of the configured markdown
# extensions.
#
# The list is read from the comma-separated `markdown_ext` configuration
# entry. When that entry is missing, Jekyll's documented default list is
# used. Matching is case-insensitive, as Jekyll's own markdown converter
# matches extensions regardless of case.
#
# filename - String path or file name.
#
# Returns true or false.
def markdown?(filename)
  ext = File.extname(filename).delete('.').downcase
  # A file without extension can never be markdown, even if the configured
  # list happens to contain an empty entry (e.g. "md,,mkd").
  return false if ext.empty?

  configured = @config['markdown_ext'] || 'markdown,mkdown,mkdn,mkd,md'
  configured.split(',').map { |entry| entry.strip.downcase }.include?(ext)
end
|
|
|
|
# Validates the three values needed to talk to the index.
#
# For the first value that is missing, logs an error followed by
# configuration hints through Jekyll.logger and terminates the process with
# exit status 1. nil, false, empty and whitespace-only values are all
# treated as missing.
#
# api_key        - the API key.
# application_id - the application ID.
# index_name     - the name of the index.
#
# Returns true when all three values are present.
def check_credentials(api_key, application_id, index_name)
  missing = ->(value) { !value || value.to_s.strip.empty? }

  if missing.call(api_key)
    Jekyll.logger.error 'Algolia Error: No API key defined'
    Jekyll.logger.warn ' You have two ways to configure your API key:'
    Jekyll.logger.warn ' - The ALGOLIA_API_KEY environment variable'
    Jekyll.logger.warn ' - A file named ./_algolia_api_key'
    exit 1
  end

  if missing.call(application_id)
    Jekyll.logger.error 'Algolia Error: No application ID defined'
    Jekyll.logger.warn ' Please set your application id in the '\
                       '_config.yml file, like so:'
    puts ''
    # The spaces are needed otherwise the text is centered
    Jekyll.logger.warn ' algolia: '
    Jekyll.logger.warn ' application_id: \'{your_application_id}\''
    puts ''
    Jekyll.logger.warn ' Your application ID can be found in your algolia'\
                       ' dashboard'
    Jekyll.logger.warn ' https://www.algolia.com/licensing'
    exit 1
  end

  if missing.call(index_name)
    Jekyll.logger.error 'Algolia Error: No index name defined'
    Jekyll.logger.warn ' Please set your index name in the _config.yml'\
                       ' file, like so:'
    puts ''
    # The spaces are needed otherwise the text is centered
    Jekyll.logger.warn ' algolia: '
    Jekyll.logger.warn ' index_name: \'{your_index_name}\''
    puts ''
    Jekyll.logger.warn ' You can edit your indices in your dashboard'
    Jekyll.logger.warn ' https://www.algolia.com/explorer'
    exit 1
  end

  true
end
|
|
|
|
# Applies the search settings used by this plugin to an index.
#
# index - an object responding to `set_settings` with a Hash of settings.
#
# Returns whatever `index.set_settings` returns.
def configure_index(index)
  settings = {
    # Records carrying the same parent_id are collapsed by `distinct`.
    attributeForDistinct: 'parent_id',
    attributesForFaceting: %w(tags type),
    attributesToHighlight: %w(title content),
    attributesToIndex: %w(title h1 h2 h3 h4 h5 h6 content tags),
    attributesToRetrieve: %w(title posted_at content url css_selector),
    customRanking: ['desc(posted_at)', 'desc(title_weight)'],
    distinct: true,
    highlightPreTag: '<span class="algolia__result-highlight">',
    highlightPostTag: '</span>'
  }
  index.set_settings(settings)
end
|
|
|
|
# Replaces the content of the configured index with the given records.
#
# Reads the credentials, validates them with check_credentials, configures
# and clears the index, then uploads the records in batches of 1000. If a
# batch upload raises, the error is logged and the process exits with
# status 1.
#
# items - Array of records to index.
#
# Returns whatever the final Jekyll.logger.info call returns.
def push(items)
  # A configuration without any `algolia` section must reach
  # check_credentials (which prints guidance) rather than fail on nil here.
  algolia_config = @config['algolia'] || {}
  api_key = AlgoliaSearchJekyll.api_key
  application_id = algolia_config['application_id']
  index_name = algolia_config['index_name']
  check_credentials(api_key, application_id, index_name)

  Algolia.init(application_id: application_id, api_key: api_key)
  index = Algolia::Index.new(index_name)
  configure_index(index)
  # NOTE(review): the index is emptied before the upload starts, so a failed
  # upload leaves it partially filled.
  index.clear_index

  items.each_slice(1000) do |batch|
    Jekyll.logger.info "Indexing #{batch.size} items"
    begin
      index.add_objects(batch)
    rescue StandardError => error
      Jekyll.logger.error 'Algolia Error: HTTP Error'
      Jekyll.logger.warn error.message
      exit 1
    end
  end

  Jekyll.logger.info "Indexing of #{items.size} items done."
end
|
|
|
|
# Builds the list of records for one site file.
#
# Only posts and markdown pages are indexed. Newlines in the file content
# are replaced by spaces before the content is handed to
# get_paragraphs_from_html together with the data shared by all records of
# that file.
#
# file - a site file; anything that is neither a Jekyll::Page nor a
#        Jekyll::Post is ignored.
#
# Returns the result of get_paragraphs_from_html, or nil when the file is
# not indexed.
def get_items_from_file(file)
  is_page = file.is_a?(Jekyll::Page)
  is_post = file.is_a?(Jekyll::Post)

  # We only index posts, and markdown pages
  return nil unless is_page || is_post
  return nil if is_page && !markdown?(file.path)

  html = file.content.tr("\n", ' ')

  base_data =
    if is_post
      {
        type: 'post',
        parent_id: file.id,
        url: file.url,
        title: file.title,
        tags: get_tags_from_post(file),
        slug: file.slug,
        posted_at: file.date.to_time.to_i
      }
    else
      {
        type: 'page',
        # parent_id feeds both the objectID and the `distinct` grouping.
        # The basename alone is shared by pages such as about/index.md and
        # docs/index.md, so the URL is used to keep pages apart.
        parent_id: file.url,
        url: file.url,
        title: file['title'],
        slug: file.basename
      }
    end

  get_paragraphs_from_html(html, base_data)
end
|
|
|
|
# Get a list of tags from a post. Handle both classic string tags or
|
|
# extended object tags
|
|
# Returns the tags of a post as an Array of Strings with commas removed.
#
# Each tag is converted with `to_s`, so both plain string tags and tag
# objects are supported. A single non-Array value is treated as one tag.
# The post's own tag list is left untouched.
#
# post - an object responding to `tags`.
#
# Returns an Array of Strings (empty when the post has no tags).
def get_tags_from_post(post)
  tags = post.tags
  return [] if tags.nil?

  tags = [tags] unless tags.is_a?(Array)
  tags.map { |tag| tag.to_s.delete(',') }
end
|
|
|
|
# Collects the headings found among the previous siblings of a node.
#
# Walking backwards from the node, a heading (h1 to h6) is recorded only
# when its level is strictly higher in the hierarchy (smaller number) than
# every heading recorded so far; the result therefore holds the closest
# heading of each enclosing level.
#
# node - an object responding to `previous_sibling`; siblings must respond
#        to `element?`, `name`, `text` and `previous_sibling`.
# memo - Hash receiving the result. Its optional :level entry is the level
#        below which headings are accepted (default 7, i.e. any heading);
#        the entry is removed before returning.
#
# Returns memo, with Symbol keys (:h1 .. :h6) mapped to the heading text.
def get_previous_hx(node, memo = { level: 7 })
  level = memo.delete(:level) || 7
  current = node.previous_sibling

  # Iterate rather than recurse: a recursive walk needs one stack frame per
  # sibling and overflows the stack on very long documents.
  while current
    # Non-element nodes and non-heading elements are skipped.
    heading_level = current.element? && current.name[/\Ah([1-6])\z/, 1]
    if heading_level && heading_level.to_i < level
      level = heading_level.to_i
      memo[current.name.to_sym] = current.text
    end
    # Nothing can outrank an h1, so the walk can stop there.
    break if level <= 1

    current = current.previous_sibling
  end

  memo
end
|
|
|
|
# Get a custom value representing the number of word occurrences from the
|
|
# titles into the content
|
|
# Counts how many distinct words of the record's title and headings appear
# in the given content.
#
# Words are runs of word characters taken from the :title and :h1 to :h6
# values of the item. Missing or nil values contribute no words. A word
# counts when it is a (case-sensitive) substring of the content.
#
# content - String to look the words up in.
# item    - Hash of record data.
#
# Returns an Integer.
def get_title_weight(content, item)
  words = %i(title h1 h2 h3 h4 h5 h6)
          # scan never yields empty strings (which every content "includes")
          # and to_s tolerates nil values such as an untitled page.
          .flat_map { |key| item[key].to_s.scan(/[[:word:]]+/) }
          .uniq

  words.count { |word| content.include?(word) }
end
|
|
|
|
# Will get a unique css selector for the node
|
|
# Builds a CSS selector for a node from its `css_path`, with the
# 'html > body > ' part removed.
#
# node - an object responding to `css_path` with a String.
#
# Returns a String.
def get_css_selector(node)
  node.css_path.gsub('html > body > ', '')
end
|
|
|
|
# Turns every <p> element of an HTML string into one record.
#
# Each record starts as a copy of base_data and is completed with the
# headings returned by get_previous_hx, an objectID made of the parent_id
# and the paragraph position, the CSS selector of the paragraph, its markup
# as content and its title weight.
#
# html      - String of HTML, parsed with Nokogiri::HTML.
# base_data - Hash of data shared by all records; it is not modified.
#
# Returns an Array of Hashes, one per paragraph, in document order.
def get_paragraphs_from_html(html, base_data)
  paragraphs = Nokogiri::HTML(html).css('p')

  paragraphs.each_with_index.map do |paragraph, position|
    record = base_data.clone
    record.merge!(get_previous_hx(paragraph))
    record[:objectID] = "#{record[:parent_id]}_#{position}"
    record[:css_selector] = get_css_selector(paragraph)
    record[:content] = paragraph.to_s
    # Computed last: the weight looks at the title and headings already
    # present in the record.
    record[:title_weight] = get_title_weight(paragraph.text, record)
    record
  end
end
|
|
end
|
|
end
|