210 lines
4.9 KiB
Ruby
210 lines
4.9 KiB
Ruby
# require 'algoliasearch'
|
|
# require 'nokogiri'
|
|
# require 'json'
|
|
require 'algolia_html_extractor'
|
|
|
|
module Jekyll
  module Algolia
    # Module to extract records from Jekyll files
    module Extractor
      include Jekyll::Algolia

      # Public: Extract records from the file
      #
      # file - The Jekyll file to process. Only its `content` is read here.
      #
      # Returns the raw records produced by extract_raw_records for the
      # file's content.
      def self.run(file)
        # Getting all hierarchical nodes from the HTML input
        extract_raw_records(file.content)
      end

      # Public: Extract raw records from the file, including content for each
      # node to index and hierarchy
      #
      # content - The HTML content to parse
      #
      # The CSS selector deciding which nodes are indexed is read from the
      # `nodes_to_index` Algolia setting through Configurator.algolia.
      #
      # Returns whatever AlgoliaHTMLExtractor#extract returns for the content.
      def self.extract_raw_records(content)
        AlgoliaHTMLExtractor.new(
          content,
          options: {
            css_selector: Configurator.algolia('nodes_to_index')
          }
        ).extract
      end
    end
  end
end
|
|
# attr_reader :file
|
|
#
|
|
# def initialize(file)
|
|
# @file = file
|
|
# @config = file.site.config
|
|
# default_config = {
|
|
# 'nodes_to_index' => 'p'
|
|
# }
|
|
# @config = default_config.merge(file.site.config['algolia'])
|
|
# end
|
|
#
|
|
# # Hook to modify a record after extracting
|
|
# def custom_hook_each(item, _node)
|
|
# item
|
|
# end
|
|
#
|
|
# # Hook to modify all records after extracting
|
|
# def custom_hook_all(items)
|
|
# items
|
|
# end
|
|
#
|
|
# ##
|
|
# # Return the type of the Jekyll element
|
|
# # It can be either page, post or document
|
|
# def type
|
|
# classname = @file.class.name
|
|
# subclass = classname.split('::')[1]
|
|
# type = subclass.downcase
|
|
#
|
|
# # Posts are actually a specific type of Document
|
|
# if type == 'document'
|
|
# collection_name = @file.collection.label
|
|
# return 'post' if collection_name == 'posts'
|
|
# end
|
|
#
|
|
# type
|
|
# end
|
|
#
|
|
# ##
|
|
# # Return the url of the page
|
|
# def url
|
|
# @file.url
|
|
# end
|
|
#
|
|
# ##
|
|
# # Return the title of the page
|
|
# def title
|
|
# @file.data['title']
|
|
# end
|
|
#
|
|
# ##
|
|
# # Returns the slug of the document
|
|
# def slug
|
|
# # We can guess the slug from the filename for all documents
|
|
# basename = File.basename(@file.path)
|
|
# extname = File.extname(basename)
|
|
# slug = File.basename(basename, extname)
|
|
#
|
|
# # Jekyll v3 posts have it in data
|
|
# return @file.data['slug'] if @file.data.key?('slug')
|
|
#
|
|
# # Jekyll v2 posts have a specific slug method
|
|
# return @file.slug if @file.respond_to?(:slug)
|
|
#
|
|
# slug
|
|
# end
|
|
#
|
|
# ##
|
|
# # Get an array of tags of the document
|
|
# def tags
|
|
# tags = []
|
|
#
|
|
# has_tags_data = @file.data.key?('tags')
|
|
#
|
|
# # All tags are in data['tags']
|
|
# tags = @file.data['tags'] if has_tags_data
|
|
#
|
|
# # Some extensions extend the tags with custom classes, so we make sure we
|
|
# # cast them as strings
|
|
# tags.map(&:to_s)
|
|
# end
|
|
#
|
|
# ##
|
|
# # Get the post date timestamp
|
|
# def date
|
|
# return nil unless @file.respond_to?(:date)
|
|
#
|
|
# @file.date.to_time.to_i
|
|
# end
|
|
#
|
|
# ##
|
|
# # Get the collection name of a document
|
|
# def collection
|
|
# return nil unless @file.respond_to?(:collection)
|
|
#
|
|
# collection_name = @file.collection.label
|
|
#
|
|
# # In Jekyll v3, posts are actually a collection
|
|
# return nil if collection_name == 'posts'
|
|
# collection_name
|
|
# end
|
|
#
|
|
# ##
|
|
# # Get a hash of all front-matter data
|
|
# def front_matter
|
|
# raw_data = @file.data
|
|
#
|
|
# # We clean some keys that will be handled by specific methods
|
|
# attributes_to_remove = %w(title tags slug url date type)
|
|
# attributes_to_remove.each do |attribute|
|
|
# raw_data.delete(attribute)
|
|
# end
|
|
#
|
|
# # Convert to symbols
|
|
# data = {}
|
|
# raw_data.each do |key, value|
|
|
# data[key.to_sym] = value
|
|
# end
|
|
#
|
|
# data
|
|
# end
|
|
#
|
|
# ##
|
|
# # Get the list of all node data
|
|
# def hierarchy_nodes
|
|
# extractor_options = {
|
|
# css_selector: @config['nodes_to_index']
|
|
# }
|
|
#
|
|
# AlgoliaHTMLExtractor.new(
|
|
# @file.content,
|
|
# options: extractor_options
|
|
# ).extract
|
|
# end
|
|
#
|
|
# # Extract all records from the page and return the list
|
|
# def extract
|
|
# # Getting all hierarchical nodes from the HTML input
|
|
# raw_items = hierarchy_nodes
|
|
#
|
|
# # Shared attributes relative to the page that all records will have
|
|
# shared_attributes = {
|
|
# type: type,
|
|
# url: url,
|
|
# title: title,
|
|
# slug: slug,
|
|
# date: date,
|
|
# collection: collection,
|
|
# tags: tags
|
|
# }
|
|
# # Remove empty attributes
|
|
# shared_attributes = shared_attributes.delete_if do |_, value|
|
|
# value.nil?
|
|
# end
|
|
#
|
|
# # Enriching with page metadata
|
|
# items = []
|
|
# raw_items.each do |raw_item|
|
|
# nokogiri_node = raw_item[:node]
|
|
# raw_item.delete(:node)
|
|
# item = shared_attributes.merge(raw_item)
|
|
# item[:objectID] = item[:uuid]
|
|
# item.delete(:uuid)
|
|
#
|
|
# item = custom_hook_each(item, nokogiri_node)
|
|
# next if item.nil?
|
|
#
|
|
# items << item
|
|
# end
|
|
#
|
|
# custom_hook_all(items)
|
|
# end
|
|
# end
|