210 lines
4.9 KiB
Ruby

# require 'algoliasearch'
# require 'nokogiri'
# require 'json'
require 'algolia_html_extractor'
module Jekyll
  module Algolia
    # Turns the content of Jekyll files into lists of raw records
    module Extractor
      include Jekyll::Algolia

      class << self
        # Public: Build the records for a single file
        #
        # file - The Jekyll file to process; only its `content` is read here
        #
        # Returns the value produced by extract_raw_records for that content
        def run(file)
          extract_raw_records(file.content)
        end

        # Public: Run the HTML extractor over the given content
        #
        # content - The HTML content to parse
        #
        # The CSS selector handed to the extractor is read from the
        # 'nodes_to_index' Algolia setting through Configurator.algolia.
        #
        # Returns the result of AlgoliaHTMLExtractor#extract
        def extract_raw_records(content)
          extractor_options = {
            css_selector: Configurator.algolia('nodes_to_index')
          }

          AlgoliaHTMLExtractor.new(content, options: extractor_options).extract
        end
      end
    end
  end
end
# attr_reader :file
#
# def initialize(file)
# @file = file
# @config = file.site.config
# default_config = {
# 'nodes_to_index' => 'p'
# }
# @config = default_config.merge(file.site.config['algolia'])
# end
#
# # Hook to modify a record after extracting
# def custom_hook_each(item, _node)
# item
# end
#
# # Hook to modify all records after extracting
# def custom_hook_all(items)
# items
# end
#
# ##
# # Return the type of the Jekyll element
# # It can be either page, post or document
# def type
# classname = @file.class.name
# subclass = classname.split('::')[1]
# type = subclass.downcase
#
# # Posts are actually a specific type of Document
# if type == 'document'
# collection_name = @file.collection.label
# return 'post' if collection_name == 'posts'
# end
#
# type
# end
#
# ##
# # Return the url of the page
# def url
# @file.url
# end
#
# ##
# # Return the title of the page
# def title
# @file.data['title']
# end
#
# ##
# # Returns the slug of the document
# def slug
# # We can guess the slug from the filename for all documents
# basename = File.basename(@file.path)
# extname = File.extname(basename)
# slug = File.basename(basename, extname)
#
# # Jekyll v3 posts have it in data
# return @file.data['slug'] if @file.data.key?('slug')
#
# # Jekyll v2 posts have a specific slug method
# return @file.slug if @file.respond_to?(:slug)
#
# slug
# end
#
# ##
# # Get an array of tags of the document
# def tags
# tags = []
#
# has_tags_data = @file.data.key?('tags')
#
# # All tags are in data['tags']
# tags = @file.data['tags'] if has_tags_data
#
# # Some extensions extend the tags with custom classes, so we make sure we
# # cast them to strings
# tags.map(&:to_s)
# end
#
# ##
# # Get the post date timestamp
# def date
# return nil unless @file.respond_to?(:date)
#
# @file.date.to_time.to_i
# end
#
# ##
# # Get the collection name of a document
# def collection
# return nil unless @file.respond_to?(:collection)
#
# collection_name = @file.collection.label
#
# # In Jekyll v3, posts are actually a collection
# return nil if collection_name == 'posts'
# collection_name
# end
#
# ##
# # Get a hash of all front-matter data
# def front_matter
# raw_data = @file.data
#
# # We clean some keys that will be handled by specific methods
# attributes_to_remove = %w(title tags slug url date type)
# attributes_to_remove.each do |attribute|
# raw_data.delete(attribute)
# end
#
# # Convert to symbols
# data = {}
# raw_data.each do |key, value|
# data[key.to_sym] = value
# end
#
# data
# end
#
# ##
# # Get the list of all node data
# def hierarchy_nodes
# extractor_options = {
# css_selector: @config['nodes_to_index']
# }
#
# AlgoliaHTMLExtractor.new(
# @file.content,
# options: extractor_options
# ).extract
# end
#
# # Extract all records from the page and return the list
# def extract
# # Getting all hierarchical nodes from the HTML input
# raw_items = hierarchy_nodes
#
# # Shared attributes relative to the page that all records will have
# shared_attributes = {
# type: type,
# url: url,
# title: title,
# slug: slug,
# date: date,
# collection: collection,
# tags: tags
# }
# # Remove empty attributes
# shared_attributes = shared_attributes.delete_if do |_, value|
# value.nil?
# end
#
# # Enriching with page metadata
# items = []
# raw_items.each do |raw_item|
# nokogiri_node = raw_item[:node]
# raw_item.delete(:node)
# item = shared_attributes.merge(raw_item)
# item[:objectID] = item[:uuid]
# item.delete(:uuid)
#
# item = custom_hook_each(item, nokogiri_node)
# next if item.nil?
#
# items << item
# end
#
# custom_hook_all(items)
# end
# end