Compacting Lunr search indices

Lunr is a small JavaScript library for full-text search, which I recently used to implement client-side search for this site. The user experience of client-side search depends in part on how large the search index is, and Lunr's default JSON encoding is more verbose than it needs to be. This page describes a more compact encoding that reduces the compressed index size by roughly 40%, and the raw JSON size by more than half.

I'll be using the Project Gutenberg editions of Moby Dick and Pride and Prejudice as the example search corpus, with the Project Gutenberg metadata and licensing information trimmed off.

curl -L -O https://www.gutenberg.org/files/1342/1342-0.txt
curl -L -O https://www.gutenberg.org/files/2701/2701-0.txt
shasum -a 256 *.txt
# c3fc0e1900e233a0c3c6ca5784a54a3d3aaf00d40603315c644487bd7a07e22f  1342-0.txt
# 61d5ab6a3910fab66eabc9d2fc708b68b756199cb754fd5ff51751dbe5f766cd  2701-0.txt
tail -n +168 1342-0.txt | head -n 14060 > pride-and-prejudice.txt
tail -n +337 2701-0.txt | head -n 21624 > moby-dick.txt

A basic Lunr indexing program uses a lunr.Builder to assemble the index, then converts it to JSON with toJSON().

import * as fs from "fs";
import lunr from "lunr";

function main(argv) {
	if (argv.length < 3) {
		console.error("usage: lunr-index FILES...");
		process.exit(1);
	}
	const fileNames = argv.slice(2);

	const idx = lunr((builder) => {
		builder.ref("name");
		builder.field("text");
		builder.pipeline.remove(lunr.stemmer);
		fileNames.forEach((fileName) => {
			builder.add({
				name: fileName,
				text: fs.readFileSync(fileName)
			});
		});
	});
	process.stdout.write(JSON.stringify(indexToJSON(idx)));
}

function indexToJSON(idx) {
	return idx.toJSON();
}

main(process.argv);

The resulting index JSON is about 1.68 MB raw, 250 KB with gzip, or 192 KB with Brotli. These two compression formats are useful because they can be consumed directly by common web browsers.

I also tried compressing it with zstd, which I expected to provide the best compression ratio, but in this case Brotli performed better.

node lunr-index-v1.js moby-dick.txt pride-and-prejudice.txt > index-v1.json
gzip -9k index-v1.json
brotli -q 11 -w 24 index-v1.json
zstd -19 index-v1.json
# index-v1.json        : 12.76%   (  1.61 MiB =>    211 KiB, index-v1.json.zst)
ls -lS --si index-v1.*
# -rw-r--r-- 1 john john 1.7M May 27 16:08 index-v1.json
# -rw-r--r-- 1 john john 249k May 27 16:08 index-v1.json.gz
# -rw-r--r-- 1 john john 216k May 27 16:08 index-v1.json.zst
# -rw-r--r-- 1 john john 192k May 27 16:08 index-v1.json.br

Compacting the inverted index

The Lunr index JSON contains two big lists, fieldVectors and invertedIndex. Of those, the inverted index is far bigger, and has more opportunity for space savings.

{
	"fields": ["text"],
	"fieldVectors": [
		[ "text/moby-dick.txt", [ /* ... */ ] ],
		[ "text/pride-and-prejudice.txt", [ /* ... */ ] ] ],
	"invertedIndex": [
		[ "1", {
			"_index": 1363,
			"text": {
				"moby-dick.txt": {},
				"pride-and-prejudice.txt": {}
			} } ],
		[ "1,000,000", {
			"_index": 7298,
			"text": {
				"moby-dick.txt": {}
			} } ],
		// ...
	] }

Notice how much duplicate text there is:

  • The index is semantically a list ordered by when each token was first observed, but it is stored sorted by token value. The _index property wouldn't be needed if the entries were stored in observation order.
  • Field names (here, "text") are repeated for each token. These could be indexes into the top-level fields property instead. Or, since the set of fields is bounded and small, the field indexes could be implied by list ordering.
  • Document references (such as "moby-dick.txt") are also repeated, and when combined with field names could be indexes into the fieldVectors property.

The following code applies these basic transformations to the invertedIndex property.

function indexToJSON(idx) {
	const output = idx.toJSON();
	output.invertedIndex = compactInvIndex(output);
	return output;
}

function compactInvIndex(index) {
	const fields = index["fields"];
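	// Map each "field/docRef" key to its position in fieldVectors, so compacted
	// entries can refer to documents by integer index.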
	const fieldVectorIdxs = new Map(index["fieldVectors"].map((v, idx) => {
		return [v[0], idx];
	}));

	const items = new Map(index["invertedIndex"].map(item => {
		const token = item[0];
		const props = item[1];
		const newItem = [token];
		fields.forEach((field) => {
			const fProps = props[field];
			const matches = [];
			Object.keys(fProps).forEach((docRef) => {
				const fieldVectorIdx = fieldVectorIdxs.get(`${field}/${docRef}`);
				if (fieldVectorIdx === undefined) {
					throw new Error();
				}
				matches.push(fieldVectorIdx);
				matches.push(fProps[docRef]);
			});
			newItem.push(matches);
		});

		return [props["_index"], newItem];
	}));

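	// Emit the entries in their original _index order, so _index itself no longer needs to be stored.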
	const indexes = Array.from(items.keys()).sort((a, b) => a - b);

	const compacted = Array.from(indexes, (k) => {
		const item = items.get(k);
		if (item === undefined) {
			throw new Error();
		}
		return item;
	});

	return compacted;
}
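
With the example corpus, each compacted entry is the token followed by one flat array per field of (fieldVectors index, match metadata) pairs, and the entries are ordered by their original _index rather than alphabetically. A rough sketch of the output (fieldVectors entry 0 is moby-dick.txt, entry 1 is pride-and-prejudice.txt):

"invertedIndex": [
	// ...
	[ "1", [ 0, {}, 1, {} ] ],
	// ...
	[ "1,000,000", [ 0, {} ] ],
	// ...
]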

The raw (uncompressed) index size is reduced by over 50%, and compressed sizes by about 25%.

node lunr-index-v2.js moby-dick.txt pride-and-prejudice.txt > index-v2.json
[...]
ls -lS --si index-v2.*
# -rw-r--r-- 1 john john 751k May 27 16:17 index-v2.json
# -rw-r--r-- 1 john john 188k May 27 16:17 index-v2.json.gz
# -rw-r--r-- 1 john john 162k May 27 16:17 index-v2.json.zst
# -rw-r--r-- 1 john john 145k May 27 16:17 index-v2.json.br

Compacting the field vectors

The field vectors are semantically a key-value map, where the keys are integer indexes into the inverted index. There isn't much here to optimize in terms of plain text, but a simple tweak can improve how compressible the data is.

Consider a file that is simply a big list of integers: 1, 2, 3, 4, 6, 7, and so on. A generic compression function can take advantage of the limited character set, but it doesn't have the semantic understanding to encode "next integer in sequence". However, if the file is rewritten so that each integer that is exactly one more than its predecessor is replaced by a static sentinel value, the resulting runs of the sentinel compress very well.
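
For example (a sketch of the idea, not a format Lunr itself uses), the list above could be stored as:

// original: every value is distinct
[ 1, 2, 3, 4, 6, 7 ]
// sequential values replaced with a null sentinel
[ 1, null, null, null, 6, null ]

The field vectors of the example index contain just such runs: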

"fieldVectors": [
	[ "text/moby-dick.txt", [
		0, 0.607,
		1, 0.356,
		2, 0.382,
		// ...
		] ],
	[ "text/pride-and-prejudice.txt", [
		1, 0.278,
		2, 0.383,
		6, 0.278,
		// ...
		] ] ]

For Lunr indexes, because the inverted index is expanded as new tokens are observed, there are likely to be long runs of sequential integers in the first "column" of the field vectors. They can be replaced with null.

function indexToJSON(idx) {
	const output = idx.toJSON();
	output.invertedIndex = compactInvIndex(output);
	output.fieldVectors = compactVectors(output);
	return output;
}

function compactVectors(index) {
	return index["fieldVectors"].map((item) => {
		const id = item[0];
		const vectors = item[1];
		let prev = null;
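		// Even positions hold token indexes, odd positions hold weights;
		// only the token indexes are candidates for the null sentinel.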
		const compacted = vectors.map((v, ii) => {
			if (ii % 2 === 0) {
				if (prev !== null && v === prev + 1) {
					prev += 1;
					return null;
				}
				prev = v;
			}
			return v;
		});
		return [id, compacted];
	});
}
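
Applied to the field vectors shown above, the output becomes:

"fieldVectors": [
	[ "text/moby-dick.txt", [
		0, 0.607,
		null, 0.356,
		null, 0.382,
		// ...
		] ],
	[ "text/pride-and-prejudice.txt", [
		1, 0.278,
		null, 0.383,
		6, 0.278,
		// ...
		] ] ]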

This optimization shaves another 20% to 25% from the compressed file sizes.

node lunr-index-v3.js moby-dick.txt pride-and-prejudice.txt > index-v3.json
[...]
ls -lS --si index-v3.*
# -rw-r--r-- 1 john john 741k May 27 16:27 index-v3.json
# -rw-r--r-- 1 john john 137k May 27 16:27 index-v3.json.gz
# -rw-r--r-- 1 john john 120k May 27 16:27 index-v3.json.zst
# -rw-r--r-- 1 john john 118k May 27 16:27 index-v3.json.br

Recovering the original index data

Lunr can't directly consume the compacted form of its search index, so we need to reverse the above optimizations before calling lunr.Index.load().

import * as fs from "fs";

function main(argv) {
	if (argv.length !== 3) {
		console.error("usage: lunr-index-expand FILE");
		process.exit(1);
	}
	const compactIndex = JSON.parse(fs.readFileSync(argv[2]));
	process.stdout.write(JSON.stringify(expand(compactIndex)));
}

function expand(compact) {
	const fields = compact["fields"];

	const fieldVectors = compact["fieldVectors"].map((item) => {
		const id = item[0];
		const vectors = item[1];
		let prev = null;
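		// A null token index means "previous token index plus one".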
		const expanded = vectors.map((v, ii) => {
			if (ii % 2 === 0) {
				if (v === null) {
					v = prev + 1;
				}
				prev = v;
			}
			return v;
		});
		return [id, expanded];
	});

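	// Rebuild each inverted index entry; the compacted list is ordered by the
	// original _index, so list position supplies that value.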
	const invertedIndex = compact["invertedIndex"].map((item, itemIdx) => {
		const token = item[0];
		const fieldMap = {"_index": itemIdx};
		fields.forEach((field, fieldIdx) => {
			const matches = {};

			let docRef = null;
			item[fieldIdx + 1].forEach((v, ii) => {
				if (ii % 2 === 0) {
					docRef = fieldVectors[v][0].slice(`${field}/`.length);
				} else {
					matches[docRef] = v;
				}
			});
			fieldMap[field] = matches;
		});
		return [token, fieldMap];
	});

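	// Lunr stores the inverted index sorted by token value, so restore that ordering.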
	invertedIndex.sort((a, b) => {
		if (a[0] < b[0]) {
			return -1;
		}
		if (a[0] > b[0]) {
			return 1;
		}
		return 0;
	});

	return {
		"version": compact["version"],
		"fields": fields,
		"fieldVectors": fieldVectors,
		"invertedIndex": invertedIndex,
		"pipeline": compact["pipeline"],
	};
}

main(process.argv);

The expanded output is identical to the original search index JSON.

node lunr-index-expand.js index-v3.json > index-v3-expanded.json
shasum -a 256 index-v1.json index-v3-expanded.json
# a6f96e4046152213c0c41a12dc83522a85f91db6603c34cb8b85174efc3ade3f  index-v1.json
# a6f96e4046152213c0c41a12dc83522a85f91db6603c34cb8b85174efc3ade3f  index-v3-expanded.json
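
In a browser, the same expansion can happen client-side so that only the compact form travels over the network. A minimal sketch, assuming expand() is bundled with the page and the compact index is served at /index.json (both names are placeholders):

import lunr from "lunr";

async function loadIndex() {
	// Fetch the compact index, expand it back into Lunr's format, and load it.
	const compact = await (await fetch("/index.json")).json();
	return lunr.Index.load(expand(compact));
}

const idx = await loadIndex();
console.log(idx.search("whale"));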