/*!
 * lunr.tokenizer
 * Copyright (C) @YEAR Oliver Nightingale
 */

/**
 * A function for splitting a string into tokens ready to be inserted into
 * the search index. Uses `lunr.tokenizer.separator` to split strings; change
 * the value of this property to change how strings are split into tokens.
 *
 * This tokenizer will convert its parameter to a string by calling `toString` and
 * then will split this string on the character in `lunr.tokenizer.separator`.
 * Arrays will have their elements converted to strings and wrapped in a lunr.Token.
 *
 * Optional metadata can be passed to the tokenizer; this metadata will be cloned and
 * added as metadata to every token that is created from the object to be tokenized.
 *
 * @static
 * @param {?(string|object|object[])} obj - The object to convert into tokens
 * @param {?object} metadata - Optional metadata to associate with every token
 * @returns {lunr.Token[]}
 * @see {@link lunr.Pipeline}
 */
lunr.tokenizer = function (obj, metadata) {
  if (obj == null || obj == undefined) {
    return []
  }

  if (Array.isArray(obj)) {
    return obj.map(function (t) {
      return new lunr.Token(
        lunr.utils.asString(t).toLowerCase(),
        lunr.utils.clone(metadata)
      )
    })
  }

  var str = obj.toString().toLowerCase(),
      len = str.length,
      tokens = []

  // Walk the string one character at a time, emitting a token whenever a
  // separator character (or the end of the string) closes a non-empty slice.
  for (var sliceEnd = 0, sliceStart = 0; sliceEnd <= len; sliceEnd++) {
    var char = str.charAt(sliceEnd),
        sliceLength = sliceEnd - sliceStart

    if ((char.match(lunr.tokenizer.separator) || sliceEnd == len)) {

      if (sliceLength > 0) {
        var tokenMetadata = lunr.utils.clone(metadata) || {}
        // Record where in the original string this token came from,
        // and its position in the list of extracted tokens.
        tokenMetadata["position"] = [sliceStart, sliceLength]
        tokenMetadata["index"] = tokens.length

        tokens.push(
          new lunr.Token (
            str.slice(sliceStart, sliceEnd),
            tokenMetadata
          )
        )
      }

      sliceStart = sliceEnd + 1
    }

  }

  return tokens
}
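
// Usage sketch (illustrative, not part of the original source): assuming a
// lunr 2.x style `lunr.Token` that exposes `toString()` and a `metadata`
// property, tokenizing a string yields lowercased tokens annotated with
// their position ([start offset, length]) and index:
//
//   var tokens = lunr.tokenizer("Green plant-based tea")
//   tokens.map(function (t) { return t.toString() })
//   // => ["green", "plant", "based", "tea"]
//   tokens[1].metadata.position // => [6, 5]
//   tokens[1].metadata.index    // => 1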

/**
 * The separator used to split a string into tokens. Override this property to change
 * the behaviour of `lunr.tokenizer` when tokenizing strings. By default this splits on
 * whitespace and hyphens.
 *
 * @static
 * @see lunr.tokenizer
 */
lunr.tokenizer.separator = /[\s\-]+/
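
// Override sketch (illustrative, not part of the original source): making the
// tokenizer also split on underscores, so "snake_case" produces two tokens.
// Note the loop above tests one character at a time via `charAt`, so the
// separator should describe single separator characters:
//
//   lunr.tokenizer.separator = /[\s\-_]+/
//   lunr.tokenizer("snake_case").map(String) // => ["snake", "case"]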