jekyll-algolia/lib/jekyll/algolia/file_browser.rb

# frozen_string_literal: true

require 'algolia_html_extractor'
require 'pathname'
require 'time'

module Jekyll
  module Algolia
    # Module to get information about Jekyll files. Jekyll handles posts, pages,
    # collections, etc. They each need specific processing, so knowing which
    # kind of file we're working on will help.
    #
    # We also do not index all files. This module will help in defining which
    # files should be indexed and which should not.
    module FileBrowser
      include Jekyll::Algolia

      # Public: Return the absolute path of a Jekyll file
      #
      # filepath - The path to resolve, either absolute or relative to the
      #            Jekyll source directory
      #
      # An absolute path is only normalized. A relative path is resolved
      # against the `source` configuration value.
      #
      # Returns the absolute path as a String
      def self.absolute_path(filepath)
        candidate = Pathname.new(filepath)

        unless candidate.absolute?
          source_dir = Configurator.get('source')
          return File.expand_path(File.join(source_dir, filepath))
        end

        candidate.cleanpath.to_s
      end

      # Public: Return the path of a Jekyll file relative to the Jekyll source
      #
      # filepath - The path to convert, either absolute or relative to the
      #            Jekyll source directory
      #
      # The `source` configuration value is used as the base directory. When it
      # is not set, an empty string is expanded instead.
      #
      # Returns the relative path as a String. A relative input that resolves
      # to a location outside of the source directory is returned as an
      # absolute path.
      def self.relative_path(filepath)
        pathname = Pathname.new(filepath)
        config_source = Configurator.get('source') || ''
        jekyll_source = Pathname.new(File.expand_path(config_source))

        if pathname.relative?
          # Expanding from the source directory removes any starting ./
          fullpath = File.expand_path(
            File.join(jekyll_source.to_s, pathname.to_s)
          )

          # The source directory is compared as a plain string and not as
          # a regexp: directory names may contain characters such as `+`, `.`
          # or `(` that would otherwise be interpreted as a pattern.
          prefix = "#{jekyll_source}/"
          return fullpath unless fullpath.start_with?(prefix)

          return fullpath[prefix.length..-1]
        end

        pathname.relative_path_from(jekyll_source).cleanpath.to_s
      end

      # Public: Check if the file should be indexed
      #
      # file - The Jekyll file
      #
      # A file is rejected as soon as one of the checks below applies: static
      # asset, 404 page, redirect page, extension not allowed, excluded through
      # the configuration or excluded through the custom hook. The checks run
      # in that order and stop at the first one that rejects the file.
      #
      # Returns true if the file should be indexed, false otherwise
      def self.indexable?(file)
        rejected = static_file?(file) ||
                   is_404?(file) ||
                   redirect?(file) ||
                   !allowed_extension?(file) ||
                   excluded_from_config?(file) ||
                   excluded_from_hook?(file)

        !rejected
      end

      # Public: Check if the specified file is a static Jekyll asset
      #
      # file - The Jekyll file
      #
      # We don't index static assets (js, css, images)
      #
      # Returns true if the file is a Jekyll::StaticFile, false otherwise
      def self.static_file?(file)
        file.is_a?(Jekyll::StaticFile)
      end

      # Public: Check if the file is a 404 error page
      #
      # file - The Jekyll file
      #
      # 404 pages are not Jekyll defaults but a convention adopted by GitHub
      # pages. We don't want to index those.
      # Source: https://help.github.com/articles/creating-a-custom-404-page-for-your-github-pages-site/
      #
      # Returns true if the file is named 404.md or 404.html, in any directory
      #
      # rubocop:disable Naming/PredicateName
      def self.is_404?(file)
        basename = File.basename(file.path)
        %w[404.md 404.html].include?(basename)
      end
      # rubocop:enable Naming/PredicateName

      # Public: Check if the file is a redirect page
      #
      # file - The Jekyll file
      #
      # Plugins like jekyll-redirect-from add dynamic pages that only contain
      # an HTML meta refresh. We need to exclude those files from indexing.
      # https://github.com/jekyll/jekyll-redirect-from
      #
      # Returns true if the file is a redirect page, false otherwise
      def self.redirect?(file)
        # redirect_from makes jekyll-redirect-from create a page named
        # `redirect.html`
        generated_page = file.respond_to?(:name) && file.name == 'redirect.html'
        return true if generated_page

        # redirect_to sets the layout of the page to `redirect`
        return false unless file.respond_to?(:data)

        file.data['layout'] == 'redirect' ? true : false
      end

      # Public: Check if the file has one of the allowed extensions
      #
      # file - The Jekyll file
      #
      # Jekyll can transform markdown files to HTML by default. With plugins, it
      # can convert many more file formats. By default we'll only index markdown
      # and raw HTML files but this list can be extended using the
      # `extensions_to_index` config option.
      #
      # Returns true if the extension of the file (without the leading dot) is
      # part of the list, false otherwise
      def self.allowed_extension?(file)
        allowed = Configurator.extensions_to_index
        # Drop the leading dot; a path without extension yields nil here
        extension = File.extname(file.path)[1..-1]

        allowed.include?(extension)
      end

      # Public: Check if the file has been excluded by `files_to_exclude`
      #
      # file - The Jekyll file
      #
      # Each pattern of the `files_to_exclude` option is resolved from the
      # Jekyll source directory and matched against the absolute path of the
      # file. Matching uses File::FNM_PATHNAME, so a wildcard does not match
      # a directory separator.
      #
      # Returns true if at least one pattern matches, false otherwise
      def self.excluded_from_config?(file)
        globs = Configurator.algolia('files_to_exclude')
        source_dir = Configurator.get('source')
        target = absolute_path(file.path)

        globs.any? do |glob|
          absolute_glob = File.expand_path(File.join(source_dir, glob))
          File.fnmatch(absolute_glob, target, File::FNM_PATHNAME)
        end
      end

      # Public: Check if the file has been excluded by running a custom user
      # hook
      #
      # file - The Jekyll file
      #
      # Returns the answer of Hooks.should_be_excluded? for the path of the file
      def self.excluded_from_hook?(file)
        filepath = file.path
        Hooks.should_be_excluded?(filepath)
      end

      # Public: Return a hash of all the file metadata
      #
      # file - The Jekyll file
      #
      # It contains both the raw metadata extracted from the front-matter, as
      # well as more specific fields like the collection name, date timestamp,
      # slug, type and url. When a key exists on both sides, the specific field
      # wins.
      #
      # Returns the merged Hash, once passed through Utils.compact_empty
      def self.metadata(file)
        front_matter = raw_data(file)
        computed = {
          collection: collection(file),
          tags: tags(file),
          categories: categories(file),
          date: date(file),
          excerpt_html: excerpt_html(file),
          excerpt_text: excerpt_text(file),
          slug: slug(file),
          type: type(file),
          url: url(file)
        }

        Utils.compact_empty(front_matter.merge(computed))
      end

      # Public: Return a hash of all the raw data, as defined in the
      # front-matter and including default values
      #
      # file - The Jekyll file
      #
      # Any custom data passed to the front-matter will be returned by this
      # method. It ignores any key where we have a better, custom, getter.
      #
      # Note that even if you define tags and categories in a collection item,
      # it will not be included in the data. It's always an empty array.
      #
      # Returns a new Hash with symbol keys; the data of the file is left
      # untouched
      def self.raw_data(file)
        # Only the getters defined on this module itself are considered.
        # Relying on respond_to? would also match every method inherited from
        # Object and Module (name, class, display, method, etc.) and silently
        # drop front-matter keys that happen to share their name.
        own_getters = singleton_methods(false).map(&:to_s)

        # Remove all keys where we have a specific getter
        data = file.data.reject { |key, _value| own_getters.include?(key.to_s) }
        data.delete('excerpt')

        # Delete other keys added by Jekyll that are not in the front-matter and
        # not needed for search
        data.delete('draft')
        data.delete('ext')

        # Convert all values to a version that can be serialized to JSON, then
        # convert all keys to symbols
        Utils.keys_to_symbols(Utils.jsonify(data))
      end

      # Public: Get the type of the document (page, post, collection, etc)
      #
      # file - The Jekyll file
      #
      # Pages are simple html and markdown documents in the tree
      # Elements from a collection are called Documents
      # Posts are a custom kind of Documents
      #
      # Returns the downcased class name of the file, without its namespace, or
      # 'post' for a document that belongs to the `posts` collection
      def self.type(file)
        kind = file.class.name.split('::').last.downcase
        return 'post' if kind == 'document' && file.collection.label == 'posts'

        kind
      end

      # Public: Returns the url of the file, starting from the root
      #
      # file - The Jekyll file
      #
      # Returns whatever the `url` method of the file returns
      def self.url(file)
        file.url
      end

      # Public: Returns the list of tags of a file, defaults to an empty array
      #
      # file - The Jekyll file
      #
      # Returns the `tags` entry of the file data, or an empty Array when that
      # entry is missing
      def self.tags(file)
        declared = file.data['tags']
        return [] unless declared

        declared
      end

      # Public: Returns the list of categories of a file, defaults to an empty
      # array
      #
      # file - The Jekyll file
      #
      # Returns the `categories` entry of the file data, or an empty Array when
      # that entry is missing
      def self.categories(file)
        declared = file.data['categories']
        return [] unless declared

        declared
      end

      # Public: Returns a timestamp of the file date
      #
      # file - The Jekyll file
      #
      # Posts have their date coming from the filepath, or the front-matter.
      # Pages and other collection items can only have a date set in
      # front-matter.
      #
      # Returns the date as an Integer number of seconds, or nil when there is
      # no date or when it cannot be converted to a time
      def self.date(file)
        # Collections get their date from .date, while pages read it from .data.
        # Jekyll by default will set the date of collection to the current date,
        # but we monkey-patched that so it returns nil for collection items
        date = file.respond_to?(:date) ? file.date : file.data['date']
        return nil if date.nil?

        # If date is a string, we try to parse it
        if date.is_a?(String)
          begin
            date = Time.parse(date)
          rescue StandardError
            return nil
          end
        end

        # The front-matter can hold anything (`date: 2018` is read as an
        # Integer for instance). Such values are handled like unparseable
        # strings instead of raising a NoMethodError.
        return nil unless date.respond_to?(:to_time)

        date.to_time.to_i
      end

      # Public: Returns the raw excerpt of a file, directly as returned by
      # Jekyll. Swallow any error that could occur when reading.
      #
      # file - The Jekyll file
      #
      # This might throw an exception if the excerpt is invalid. We also
      # silence all logger output as Jekyll is quite verbose and will display
      # the potential Liquid error in the terminal, even if we catch the actual
      # error.
      #
      # Returns the stripped excerpt as a String, or nil if reading it raised
      # an error
      def self.excerpt_raw(file)
        excerpt = nil
        # The value is stored in a local instead of returned from inside the
        # block: a `return` there would unwind through Logger.silent and skip
        # whatever it runs once the block is done (presumably putting the
        # logger back in its initial state - to be confirmed in Logger). It
        # also avoids relying on the return value of Logger.silent.
        Logger.silent do
          excerpt = file.data['excerpt'].to_s.strip
        end
        excerpt
      rescue StandardError
        nil
      end

      # Public: Return true if the Jekyll default excerpt should be used for
      # this file
      #
      # file - The Jekyll file
      #
      # Most of the time, we'll use our own excerpt (the first matching
      # element), but in some cases, we'll fallback to Jekyll's default excerpt
      # if it seems to be what the user wants
      #
      # Returns true only for a post whose content includes its non-blank
      # excerpt separator
      def self.use_default_excerpt?(file)
        # Anything that is not a post never uses the default excerpt
        return false if type(file) != 'post'

        # A separator made only of whitespace is considered as not customized
        separator = file.excerpt_separator.to_s.strip

        # The separator has to actually be used in this specific post
        !separator.empty? && file.content.include?(separator)
      end

      # Public: Returns the HTML version of the excerpt
      #
      # file - The Jekyll file
      #
      # Returns the raw Jekyll excerpt when use_default_excerpt? says so.
      # Otherwise returns the first node of the content matching the
      # `nodes_to_index` selector, as a String, or nil if no node matches.
      def self.excerpt_html(file)
        # If it's a post with a custom separator for the excerpt, we honor it
        return excerpt_raw(file) if use_default_excerpt?(file)

        # Otherwise we look for the nodes matching the configured selector
        content = file.content
        css_selector = Configurator.algolia('nodes_to_index')
        matches = Nokogiri::HTML(content).css(css_selector)

        # Only the first one is kept
        node = matches.first
        node.nil? ? nil : node.to_s
      end

      # Public: Returns the text version of the excerpt
      #
      # file - The Jekyll file
      #
      # Returns the HTML excerpt of the file, once converted by
      # Utils.html_to_text
      def self.excerpt_text(file)
        Utils.html_to_text(excerpt_html(file))
      end

      # Public: Returns the slug of the file
      #
      # file - The Jekyll file
      #
      # Slugs can be automatically extracted from collections, but for other
      # files, we have to create them from the basename
      #
      # Returns the `slug` entry of the file data when that key exists,
      # otherwise the downcased basename of the file, without its extension
      def self.slug(file)
        # The real slug from the file data is used if available, and we create
        # it ourselves from the filepath otherwise
        file.data.fetch('slug') do
          filepath = file.path
          File.basename(filepath, File.extname(filepath)).downcase
        end
      end

      # Public: Returns the name of the collection
      #
      # file - The Jekyll file
      #
      # Only collection documents can have a collection name. Pages don't. Posts
      # are purposefully excluded from it as well even if they are technically
      # part of a collection
      #
      # Returns the label of the collection, or nil for files without
      # a collection and for posts
      def self.collection(file)
        return nil unless file.respond_to?(:collection)

        label = file.collection.label

        # Posts being stored in a collection is treated as an implementation
        # detail, so they get no collection name
        label == 'posts' ? nil : label
      end
    end
  end
end