/*!
 * lunr.tokenizer
 * Copyright (C) @YEAR Oliver Nightingale
 */

/**
 * A function for splitting a string into tokens ready to be inserted into
 * the search index. Uses `lunr.tokenizer.separator` to decide where tokens end;
 * change the value of that property to change how strings are split.
 *
 * Behaviour by input type:
 *
 * - `null` / `undefined` produce an empty array.
 * - Arrays are not split: each element is converted with `lunr.utils.asString`,
 *   lower-cased and wrapped in a lunr.Token. These tokens receive the result of
 *   `lunr.utils.clone(metadata)` as-is, without `position` or `index` entries.
 * - Anything else is converted by calling its `toString` method, lower-cased and
 *   scanned one character at a time. Every character matching
 *   `lunr.tokenizer.separator` ends the current token; empty tokens (for example
 *   between two adjacent separators) are dropped.
 *
 * Optional metadata can be passed to the tokenizer. It is passed through
 * `lunr.utils.clone` for every token, so tokens do not share one metadata object.
 * For tokens split out of a string two entries are added to that copy:
 *
 * - `position`: `[start, length]` of the token within the lower-cased string.
 * - `index`: zero-based ordinal of the token in the returned array.
 *
 * @static
 * @param {?(string|object|object[])} obj - The object to convert into tokens
 * @param {?object} metadata - Optional metadata to associate with every token
 * @returns {lunr.Token[]}
 * @see {@link lunr.Pipeline}
 */
lunr.tokenizer = function (obj, metadata) {
  // Loose equality against null is true for both null and undefined.
  if (obj == null) {
    return [];
  }

  if (Array.isArray(obj)) {
    return obj.map(function (t) {
      return new lunr.Token(
        lunr.utils.asString(t).toLowerCase(),
        lunr.utils.clone(metadata)
      );
    });
  }

  var str = obj.toString().toLowerCase(),
      len = str.length,
      tokens = [];

  // sliceEnd deliberately runs one step past the last character (<= len) so
  // that the token ending at the end of the string is flushed as well.
  for (var sliceEnd = 0, sliceStart = 0; sliceEnd <= len; sliceEnd++) {
    var ch = str.charAt(sliceEnd),
        sliceLength = sliceEnd - sliceStart;

    // String#match is used rather than RegExp#test so that a user supplied
    // separator carrying the `g` flag does not leak `lastIndex` state between
    // characters.
    if (ch.match(lunr.tokenizer.separator) || sliceEnd === len) {

      // A zero length slice means two separators in a row (or a leading
      // separator); there is nothing to emit in that case.
      if (sliceLength > 0) {
        var tokenMetadata = lunr.utils.clone(metadata) || {};
        tokenMetadata["position"] = [sliceStart, sliceLength];
        tokenMetadata["index"] = tokens.length;

        tokens.push(
          new lunr.Token(
            str.slice(sliceStart, sliceEnd),
            tokenMetadata
          )
        );
      }

      // The next token starts right after the separator character.
      sliceStart = sliceEnd + 1;
    }
  }

  return tokens;
};

/**
 * The separator used to split a string into tokens. Override this property to
 * change the behaviour of `lunr.tokenizer` when tokenizing strings. By default
 * this splits on whitespace and hyphens.
 *
 * `lunr.tokenizer` tests this pattern against one character at a time, so only
 * patterns that can match a single character act as separators.
 *
 * @static
 * @see lunr.tokenizer
 */
lunr.tokenizer.separator = /[\s\-]+/;