feat: enhance FlexSearch encoding for CJK support (#553)

- Added support for CJK (Chinese, Japanese, Korean) languages in FlexSearch encoding.
- Introduced `isCJK` function, which checks the document's `lang` attribute for a `zh`, `ja`, or `ko` prefix to select the appropriate encoding method.
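- Implemented `encodeCJK` (strips ASCII characters and splits the remaining text into single characters) and `encodeDefault` (lowercases the text and splits it on separators, symbols, punctuation, and control characters) as the two tokenization strategies.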
This commit is contained in:
Xin 2025-01-18 13:54:54 -05:00 committed by GitHub
parent a1232ecf9f
commit 14036ffea6
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

View File

@ -195,8 +195,19 @@ document.addEventListener("DOMContentLoaded", function () {
*/
async function preloadIndex() {
const tokenize = '{{- site.Params.search.flexsearch.tokenize | default "forward" -}}';
/**
 * Reports whether the page language is Chinese, Japanese, or Korean.
 *
 * Reads the `lang` attribute of the root element and falls back to "en" when
 * the attribute is empty.
 *
 * @returns {boolean} `true` when the primary language subtag is `zh`, `ja`, or `ko`.
 */
const isCJK = () => {
  const lang = document.documentElement.lang || "en";
  // Match only the primary subtag, and do so case-insensitively: language tags
  // are case-insensitive ("ZH-CN" is valid), and a plain prefix test would
  // misclassify unrelated tags such as "kok". "_" is accepted as a delimiter
  // so values like "zh_CN" keep working.
  return /^(?:zh|ja|ko)(?:[-_]|$)/i.test(lang);
};
/**
 * Encoder for CJK pages: turns text into one token per character.
 *
 * ASCII characters (U+0000–U+007F) are removed first, so Latin letters,
 * digits, whitespace, and ASCII punctuation never become tokens. The remaining
 * text is split by Unicode code point rather than by UTF-16 code unit, so
 * characters outside the Basic Multilingual Plane (e.g. CJK Extension B
 * ideographs) stay whole instead of being broken into two unpaired surrogates.
 *
 * @param {*} str - Text to encode; coerced to a string.
 * @returns {string[]} One token per remaining code point.
 */
const encodeCJK = (str) => Array.from(("" + str).replace(/[\x00-\x7F]/g, ""));
/**
 * Default encoder for non-CJK pages.
 *
 * Coerces the input to a string, lowercases it with the host's locale rules,
 * and splits it on runs of Unicode separator (Z), symbol (S), punctuation (P),
 * and "other" (C, e.g. control) characters. Text that starts or ends with such
 * a run yields an empty string at that edge of the result.
 *
 * @param {*} str - Text to encode.
 * @returns {string[]} The lowercased tokens.
 */
const encodeDefault = (str) => {
  const lowered = ("" + str).toLocaleLowerCase();
  return lowered.split(/[\p{Z}\p{S}\p{P}\p{C}]+/u);
};
const encodeFunction = isCJK() ? encodeCJK : encodeDefault;
window.pageIndex = new FlexSearch.Document({
tokenize,
encode: encodeFunction,
cache: 100,
document: {
id: 'id',
@ -207,6 +218,7 @@ document.addEventListener("DOMContentLoaded", function () {
window.sectionIndex = new FlexSearch.Document({
tokenize,
encode: encodeFunction,
cache: 100,
document: {
id: 'id',