Fixes #60. Using `redirect_from` from the `jekyll-redirect-from` plugin was throwing errors. Such redirect files are now correctly excluded from indexing.
365 lines
12 KiB
Ruby
365 lines
12 KiB
Ruby
# frozen_string_literal: true
|
|
|
|
require 'algolia_html_extractor'
|
|
require 'pathname'
|
|
require 'time'
|
|
|
|
module Jekyll
|
|
module Algolia
|
|
# Module to get information about Jekyll files. Jekyll handles posts, pages,
# collections, etc. They each need specific processing, so knowing which kind
# of file we're working on will help.
#
# We also do not index all files. This module helps define which files
# should be indexed and which should not.
|
|
module FileBrowser
|
|
include Jekyll::Algolia
|
|
|
|
# Public: Return the absolute path of a Jekyll file
#
# filepath - The path to resolve, either absolute or relative to the Jekyll
#            source directory
#
# Absolute paths are returned cleaned up (no `.` or `..` segments). Relative
# paths are resolved against the configured `source` directory. When no
# source is configured they are resolved against the current working
# directory, which is the same fallback relative_path uses.
#
# Returns the absolute path as a String
def self.absolute_path(filepath)
  pathname = Pathname.new(filepath)
  return pathname.cleanpath.to_s if pathname.absolute?

  # A missing source must not crash File.join, so it defaults to '' which
  # File.expand_path turns into the current working directory
  jekyll_source = File.expand_path(Configurator.get('source') || '')
  File.expand_path(File.join(jekyll_source, filepath))
end
|
|
|
|
# Public: Return the path of a Jekyll file relative to the Jekyll source
#
# filepath - The path to convert, either absolute or relative to the Jekyll
#            source directory
#
# When no `source` is configured, the current working directory is used as
# the source.
#
# Returns the relative path as a String
def self.relative_path(filepath)
  pathname = Pathname.new(filepath)
  config_source = Configurator.get('source') || ''
  jekyll_source = Pathname.new(File.expand_path(config_source))

  # Removing any starting ./
  if pathname.relative?
    fullpath = File.expand_path(File.join(jekyll_source.to_s, pathname.to_s))
    # The source path is escaped and anchored to the start of the string, so
    # directories containing regexp metacharacters (+, (, ., ...) are still
    # stripped correctly. Paths that escape the source are returned as-is.
    source_prefix = %r{\A#{Regexp.escape(jekyll_source.to_s)}/}
    return fullpath.sub(source_prefix, '')
  end

  pathname.relative_path_from(jekyll_source).cleanpath.to_s
end
|
|
|
|
# Public: Check if the file should be indexed
#
# file - The Jekyll file
#
# A file is rejected as soon as one check fails: static assets, 404 pages,
# redirect pages, extensions that are not allowed, and files excluded either
# through the config or through the user hook. Checks run in that order.
#
# Returns true if the file passes every check, false otherwise
def self.indexable?(file)
  return false if static_file?(file) || is_404?(file) || redirect?(file)
  return false unless allowed_extension?(file)

  !(excluded_from_config?(file) || excluded_from_hook?(file))
end
|
|
|
|
# Public: Check if the specified file is a static Jekyll asset
#
# file - The Jekyll file
#
# We don't index static assets (js, css, images)
#
# Returns true when the file is a Jekyll::StaticFile
def self.static_file?(file)
  file.kind_of?(Jekyll::StaticFile)
end
|
|
|
|
# Public: Check if the file is a 404 error page
#
# file - The Jekyll file
#
# 404 pages are not Jekyll defaults but a convention adopted by GitHub
# pages. We don't want to index those.
# Source: https://help.github.com/articles/creating-a-custom-404-page-for-your-github-pages-site/
#
# Returns true when the file basename is 404.md or 404.html
#
# rubocop:disable Naming/PredicateName
def self.is_404?(file)
  basename = File.basename(file.path)
  %w[404.md 404.html].include?(basename)
end
# rubocop:enable Naming/PredicateName
|
|
|
|
# Public: Check if the file is a redirect page
#
# file - The Jekyll file
#
# Plugins like jekyll-redirect-from add dynamic pages that only contain
# an HTML meta refresh. We need to exclude those files from indexing.
# https://github.com/jekyll/jekyll-redirect-from
#
# Returns true when the file is named `redirect.html` or uses the
# `redirect` layout
def self.redirect?(file)
  # When using redirect_from, jekyll-redirect-from creates a page named
  # `redirect.html`
  named_redirect = file.respond_to?(:name) && file.name == 'redirect.html'
  return true if named_redirect

  # When using redirect_to, it sets the layout to `redirect`
  file.respond_to?(:data) && file.data['layout'] == 'redirect'
end
|
|
|
|
# Public: Check if the file has one of the allowed extensions
#
# file - The Jekyll file
#
# Jekyll can transform markdown files to HTML by default. With plugins, it
# can convert many more file formats. By default we'll only index markdown
# and raw HTML files but this list can be extended using the
# `extensions_to_index` config option.
#
# Returns true when the file extension (without its leading dot) is part of
# the configured list
def self.allowed_extension?(file)
  allowed = Configurator.extensions_to_index
  # extname includes the leading dot, so it is dropped before comparing
  allowed.include?(File.extname(file.path)[1..-1])
end
|
|
|
|
# Public: Check if the file has been excluded by `files_to_exclude`
#
# file - The Jekyll file
#
# Each configured pattern is resolved against the Jekyll source directory
# and matched as a glob against the absolute path of the file.
#
# Returns true when at least one pattern matches
def self.excluded_from_config?(file)
  patterns = Configurator.algolia('files_to_exclude')
  source_dir = Configurator.get('source')
  target = absolute_path(file.path)

  patterns.any? do |glob|
    absolute_glob = File.expand_path(File.join(source_dir, glob))
    # FNM_PATHNAME prevents wildcards from matching directory separators
    File.fnmatch(absolute_glob, target, File::FNM_PATHNAME)
  end
end
|
|
|
|
# Public: Check if the file has been excluded by running a custom user
# hook
#
# file - The Jekyll file
#
# Returns whatever Hooks.should_be_excluded? answers for the file path
def self.excluded_from_hook?(file)
  filepath = file.path
  Hooks.should_be_excluded?(filepath)
end
|
|
|
|
# Public: Return a hash of all the file metadata
#
# file - The Jekyll file
#
# It contains both the raw metadata extracted from the front-matter, as
# well as more specific fields like the collection name, date timestamp,
# slug, type and url. Specific fields take precedence over raw ones, and the
# merged hash goes through Utils.compact_empty before being returned.
def self.metadata(file)
  front_matter = raw_data(file)
  computed = {
    collection: collection(file),
    tags: tags(file),
    categories: categories(file),
    date: date(file),
    excerpt_html: excerpt_html(file),
    excerpt_text: excerpt_text(file),
    slug: slug(file),
    type: type(file),
    url: url(file)
  }

  Utils.compact_empty(front_matter.merge(computed))
end
|
|
|
|
# Public: Return a hash of all the raw data, as defined in the
# front-matter and including default values
#
# file - The Jekyll file
#
# Any custom data passed to the front-matter will be returned by this
# method. It ignores any key where we have a better, custom, getter.
#
# Note that even if you define tags and categories in a collection item,
# it will not be included in the data. It's always an empty array.
#
# Returns a new Hash with symbol keys; the data of the file is not modified
def self.raw_data(file)
  data = file.data.clone

  # Remove all keys where we have a specific getter. Only getters defined
  # directly on this module count: a plain respond_to? also matches every
  # public method inherited from Object/Module (name, display, method...)
  # and would silently drop legitimate front-matter keys. Keys are
  # stringified so non-string keys (e.g. integers) cannot raise a TypeError.
  getters = singleton_methods(false).map(&:to_s)
  data.delete_if { |key, _| getters.include?(key.to_s) }
  data.delete('excerpt')

  # Delete other keys added by Jekyll that are not in the front-matter and
  # not needed for search
  data.delete('draft')
  data.delete('ext')

  # Convert all values to a version that can be serialized to JSON
  data = Utils.jsonify(data)

  # Convert all keys to symbols
  Utils.keys_to_symbols(data)
end
|
|
|
|
# Public: Get the type of the document (page, post, collection, etc)
#
# file - The Jekyll file
#
# Pages are simple html and markdown documents in the tree
# Elements from a collection are called Documents
# Posts are a custom kind of Documents
#
# Returns the downcased class name of the file (without its namespace), or
# 'post' for documents that belong to the `posts` collection
def self.type(file)
  kind = file.class.name.split('::').last.downcase
  return 'post' if kind == 'document' && file.collection.label == 'posts'

  kind
end
|
|
|
|
# Public: Returns the url of the file, starting from the root
#
# file - The Jekyll file
#
# The value is read straight from the `url` attribute of the file.
def self.url(file)
  file.url
end
|
|
|
|
# Public: Returns the list of tags of a file, defaults to an empty array
#
# file - The Jekyll file
#
# The list is read from the `tags` key of the file data.
def self.tags(file)
  declared = file.data['tags']
  declared || []
end
|
|
|
|
# Public: Returns the list of categories of a file, defaults to an empty
# array
#
# file - The Jekyll file
#
# The list is read from the `categories` key of the file data.
def self.categories(file)
  declared = file.data['categories']
  declared || []
end
|
|
|
|
# Public: Returns a timestamp of the file date
#
# file - The Jekyll file
#
# Posts have their date coming from the filepath, or the front-matter.
# Pages and other collection items can only have a date set in
# front-matter.
#
# Returns the date as an Integer number of seconds since the epoch, or nil
# when there is no date or it cannot be converted to a time
def self.date(file)
  # Collections get their date from .date, while pages read it from .data.
  # Per the original notes, Jekyll would default collection dates to the
  # current date, but a monkey-patch elsewhere makes it return nil for
  # collection items (not visible from this method)
  date = file.respond_to?(:date) ? file.date : file.data['date']
  return nil if date.nil?

  # If date is a string, we try to parse it
  if date.is_a?(String)
    begin
      date = Time.parse(date)
    rescue StandardError
      return nil
    end
  end

  # Front-matter can hold any value (e.g. an Integer); anything that cannot
  # be converted to a time is treated like an unparseable string
  return nil unless date.respond_to?(:to_time)

  date.to_time.to_i
end
|
|
|
|
# Public: Returns the raw excerpt of a file, directly as returned by
# Jekyll. Swallow any error that could occur when reading.
#
# file - The Jekyll file
#
# Reading the excerpt might throw an exception if the excerpt is invalid. We
# also silence all logger output as Jekyll is quite verbose and will display
# the potential Liquid error in the terminal, even if we catch the actual
# error.
#
# Returns the stripped excerpt as a String, or nil if reading it raised
def self.excerpt_raw(file)
  # The explicit return exits the method from inside the silenced block
  Logger.silent { return file.data['excerpt'].to_s.strip }
rescue StandardError
  nil
end
|
|
|
|
# Public: Return true if the Jekyll default excerpt should be used for
# this file
#
# file - The Jekyll file
#
# Most of the time, we'll use our own excerpt (the first matching
# element), but in some cases, we'll fallback to Jekyll's default excerpt
# if it seems to be what the user wants
def self.use_default_excerpt?(file)
  # Only posts can have excerpt
  return false if type(file) != 'post'

  # A blank separator means there is nothing to look for; otherwise the
  # default excerpt is used only when this post contains the separator
  separator = file.excerpt_separator.to_s.strip
  !separator.empty? && file.content.include?(separator)
end
|
|
|
|
# Public: Returns the HTML version of the excerpt
#
# file - The Jekyll file
#
# Returns the raw Jekyll excerpt when the default excerpt should be used,
# otherwise the first node of the content matching the `nodes_to_index`
# selector, serialized as a String, or nil when no node matches
def self.excerpt_html(file)
  # If it's a post with a custom separator for the excerpt, we honor it
  return excerpt_raw(file) if use_default_excerpt?(file)

  # Otherwise we take the first matching node
  content = file.content
  node_selector = Configurator.algolia('nodes_to_index')
  first_match = Nokogiri::HTML(content).css(node_selector).first

  first_match&.to_s
end
|
|
|
|
# Public: Returns the text version of the excerpt
#
# file - The Jekyll file
#
# Only collections (including posts) have an excerpt. Pages don't.
#
# The HTML excerpt of the file is converted through Utils.html_to_text.
def self.excerpt_text(file)
  Utils.html_to_text(excerpt_html(file))
end
|
|
|
|
# Public: Returns the slug of the file
#
# file - The Jekyll file
#
# Slugs can be automatically extracted from collections, but for other
# files, we have to create them from the basename
def self.slug(file)
  # We get the real slug from the file data if available
  front_matter = file.data
  return front_matter['slug'] if front_matter.key?('slug')

  # We create it ourselves from the filepath otherwise: the basename without
  # its extension, downcased
  filepath = file.path
  File.basename(filepath, File.extname(filepath)).downcase
end
|
|
|
|
# Public: Returns the name of the collection
#
# file - The Jekyll file
#
# Only collection documents can have a collection name. Pages don't. Posts
# are purposefully excluded from it as well even if they are technically
# part of a collection
#
# Returns the collection label, or nil for files without a collection and
# for posts
def self.collection(file)
  return nil unless file.respond_to?(:collection)

  # Posts are a special kind of collection, but that is treated as an
  # implementation detail here, so they are excluded
  label = file.collection.label
  label == 'posts' ? nil : label
end
|
|
end
|
|
end
|
|
end
|