# require 'algoliasearch' # require 'nokogiri' # require 'json' require 'algolia_html_extractor' module Jekyll module Algolia # Module to extract records from Jekyll files module Extractor include Jekyll::Algolia # Public: Extract records from the file # # file - The Jekyll file to process def self.run(file) # Getting all hierarchical nodes from the HTML input raw_records = extract_raw_records(file.content) raw_records end # Public: Extract raw records from the file, including content for each # node to index and hierarchy # # content - The HTML content to parse def self.extract_raw_records(content) AlgoliaHTMLExtractor.new( content, options: { css_selector: Configurator.algolia('nodes_to_index') } ).extract end end end end # attr_reader :file # # def initialize(file) # @file = file # @config = file.site.config # default_config = { # 'nodes_to_index' => 'p' # } # @config = default_config.merge(file.site.config['algolia']) # end # # # Hook to modify a record after extracting # def custom_hook_each(item, _node) # item # end # # # Hook to modify all records after extracting # def custom_hook_all(items) # items # end # # ## # # Return the type of the Jekyll element # # It can be either page, post or document # def type # classname = @file.class.name # subclass = classname.split('::')[1] # type = subclass.downcase # # # Post are actually a specific type of Documents # if type == 'document' # collection_name = @file.collection.label # return 'post' if collection_name == 'posts' # end # # type # end # # ## # # Return the url of the page # def url # @file.url # end # # ## # # Return the title of the page # def title # @file.data['title'] # end # # ## # # Returns the slug of the document # def slug # # We can guess the slug from the filename for all documents # basename = File.basename(@file.path) # extname = File.extname(basename) # slug = File.basename(basename, extname) # # # Jekyll v3 posts have it in data # return @file.data['slug'] if @file.data.key?('slug') # # # Jekyll v2 posts have a specific slug method # return @file.slug if @file.respond_to?(:slug) # # slug # end # # ## # # Get an array of tags of the document # def tags # tags = [] # # has_tags_data = @file.data.key?('tags') # # # All tags are in data['tags'] # tags = @file.data['tags'] if has_tags_data # # # Some extension extends the tags with custom classes, so we make sure we # # cast them as strings # tags.map(&:to_s) # end # # ## # # Get the post date timestamp # def date # return nil unless @file.respond_to?(:date) # # @file.date.to_time.to_i # end # # ## # # Get the collection name of a document # def collection # return nil unless @file.respond_to?(:collection) # # collection_name = @file.collection.label # # # In Jekyll v3, posts are actually a collection # return nil if collection_name == 'posts' # collection_name # end # # ## # # Get a hash of all front-matter data # def front_matter # raw_data = @file.data # # # We clean some keys that will be handled by specific methods # attributes_to_remove = %w(title tags slug url date type) # attributes_to_remove.each do |attribute| # raw_data.delete(attribute) # end # # # Convert to symbols # data = {} # raw_data.each do |key, value| # data[key.to_sym] = value # end # # data # end # # ## # # Get the list of all node data # def hierarchy_nodes # extractor_options = { # css_selector: @config['nodes_to_index'] # } # # AlgoliaHTMLExtractor.new( # @file.content, # options: extractor_options # ).extract # end # # # Extract all records from the page and return the list # def extract # # Getting all hierarchical nodes from the HTML input # raw_items = hierarchy_nodes # # # Shared attributes relative to the page that all records will have # shared_attributes = { # type: type, # url: url, # title: title, # slug: slug, # date: date, # collection: collection, # tags: tags # } # # Remove empty attributes # shared_attributes = shared_attributes.delete_if do |_, value| # value.nil? # end # # # Enriching with page metadata # items = [] # raw_items.each do |raw_item| # nokogiri_node = raw_item[:node] # raw_item.delete(:node) # item = shared_attributes.merge(raw_item) # item[:objectID] = item[:uuid] # item.delete(:uuid) # # item = custom_hook_each(item, nokogiri_node) # next if item.nil? # # items << item # end # # custom_hook_all(items) # end # end