allenfrostline

Content Filtering with Hexo Tipue-Search Engine


2019-05-03

It’s always been a headache to me that I cannot make my blog’s search engine show only the content I want — there’s always something you don’t want to show up in a search result, like password-protected posts (shown as encrypted code) and random pages for a certain project (some don’t even have a title, and tipue-search would still show them in the search results — with a blank title and a bunch of raw HTML). Even worse, it seems there’s no official way to set this sort of content filter. This feels bad. This terrible feeling tortured me for months until I made up my mind and fixed it in the source code today.

The fix turned out to be, well, quite straightforward. First, we locate the node package folder hexo-generator-tipue-search-json. The package structure is as follows:

node_modules
└───hexo-generator-tipue-search-json
    ├───index.js
    ├───LICENSE
    ├───package.json
    ├───README.md
    └───node_modules
        └───...

The file we need to edit is index.js. Below I’ve attached the full code after modification:

// hexo-util supplies the stripHTML helper used below to build the plain-text search body.
var util = require('hexo-util');

// Register the generator under the name 'tipue-search-json'. The function
// declaration further down is hoisted, so it can be referenced here before
// its definition. NOTE(review): `hexo` is not defined in this file —
// presumably it is the global that Hexo exposes to plugin scripts; confirm.
hexo.extend.generator.register('tipue-search-json', hexo_generator_tipue_search_json);

/**
 * Build the Tipue Search content index from the site's posts.
 *
 * Only published posts that carry no `password` property are indexed;
 * standalone pages are deliberately left out (see the disabled code below).
 *
 * @param {Object} site - Site data passed in by the caller; `site.posts`
 *     must support `sort('-date')`, `filter` and `map`.
 * @returns {{path: string, data: string}} The output path and the JSON text
 *     `{"pages": [...]}` with one `{title, url, text, tags}` entry per post.
 */
function hexo_generator_tipue_search_json(site) {
    // Strip the markup, then flatten newlines and whitespace runs into single spaces.
    function toPlainText(html) {
        var stripped = util.stripHTML(html).trim();
        return stripped.replace(/\n/g, ' ').replace(/\s+/g, ' ');
    }

    // Slug form of a tag: whitespace runs become '-', then everything is lower-cased.
    function tagSlug(tag) {
        return tag.name.replace(/\s+/g, '-').toLowerCase();
    }

    // A post is searchable when it is published and has no `password` property.
    function isSearchable(post) {
        return !(!post.published || 'password' in post);
    }

    // Key order (title, url, text, tags) determines the key order in the emitted JSON.
    function toEntry(post) {
        return {
            title: post.title,
            url: hexo.config.root + post.path,
            text: toPlainText(post.content),
            tags: post.tags.map(tagSlug).join(' ')
        };
    }

    var postsContent = site.posts.sort('-date')
        .filter(isSearchable)  // do not index unpublished or password-protected posts
        .map(toEntry);

    // Disabled on purpose: standalone pages must not show up in search results.
    //
    // var pagesContent = site.pages.sort('-date').map(function (page) {
    //     return {
    //         title: page.title,
    //         url: hexo.config.root + page.path,
    //         text: toPlainText(page.content),
    //         tags: ''
    //     };
    // });
    //
    // json.pages = postsContent.concat(pagesContent);  // pages are not searched

    var json = {};
    json.pages = postsContent;

    return {
        path: '/tipuesearch/tipuesearch_content.json',
        data: JSON.stringify(json)
    };
}

Note the second line of the definition of postsContent and the lines we commented out. These modifications ensure that encrypted posts and standalone pages won’t show up in search results.