274 lines
7 KiB
JavaScript
274 lines
7 KiB
JavaScript
"use strict";
|
|
|
|
// Simulations show these probabilities for a single change:
// 93.1% that one group is invalidated
// 4.8% that two groups are invalidated
// 1.1% that 3 groups are invalidated
// 0.1% that 4 or more groups are invalidated
//
// And these for removing/adding 10 lexically adjacent files:
// 64.5% that one group is invalidated
// 24.8% that two groups are invalidated
// 7.8% that 3 groups are invalidated
// 2.7% that 4 or more groups are invalidated
//
// And these for removing/adding 3 random files:
// 0% that one group is invalidated
// 3.7% that two groups are invalidated
// 80.8% that 3 groups are invalidated
// 12.3% that 4 groups are invalidated
// 3.2% that 5 or more groups are invalidated
|
|
|
|
/**
 * Scores how alike two keys are by comparing them position by position.
 * Only the length shared by both keys is inspected. Each position adds
 * `10 - |charCode(a) - charCode(b)|`, floored at 0, so equal characters
 * add 10 and characters whose codes are 10 or more apart add nothing.
 * @param {string} a key
 * @param {string} b key
 * @returns {number} the similarity as number
 */
const similarity = (a, b) => {
  const sharedLength = Math.min(a.length, b.length);
  let score = 0;
  let index = 0;
  while (index < sharedLength) {
    const delta = Math.abs(a.charCodeAt(index) - b.charCodeAt(index));
    score += Math.max(0, 10 - delta);
    index += 1;
  }
  return score;
};
|
|
|
|
/**
 * Derives a short name from two keys: the prefix they share followed by
 * the first character of `a` at which they differ. When no difference is
 * found within the shared length, `a` is returned unchanged.
 * @param {string} a key
 * @param {string} b key
 * @returns {string} the common part and a single char for the difference
 */
const getName = (a, b) => {
  const sharedLength = Math.min(a.length, b.length);
  for (let index = 0; index < sharedLength; index++) {
    if (a.charAt(index) !== b.charAt(index)) {
      // keep the common prefix plus the one differing character of `a`
      return a.slice(0, index + 1);
    }
  }
  return a;
};
|
|
|
|
/**
 * A single item to be grouped, stored together with its key and size.
 * @template T
 */
class Node {
  /**
   * @param {T} item the wrapped item
   * @param {string} key key of the item
   * @param {number} size size of the item
   */
  constructor(item, key, size) {
    /** @type {T} */
    this.item = item;
    /** @type {string} */
    this.key = key;
    /** @type {number} */
    this.size = size;
  }
}
|
|
|
|
/**
 * A run of nodes together with the similarities between neighbours and
 * the summed size of all contained nodes.
 * @template T
 */
class Group {
  /**
   * @param {Node<T>[]} nodes nodes
   * @param {number[]} similarities similarities between the nodes (length = nodes.length - 1)
   */
  constructor(nodes, similarities) {
    this.nodes = nodes;
    this.similarities = similarities;

    // total size is the sum of the sizes of all nodes passed in
    let total = 0;
    nodes.forEach(node => {
      total += node.size;
    });
    this.size = total;

    // the key starts out unset
    /** @type {string} */
    this.key = undefined;
  }
}
|
|
|
|
/**
 * @template T
 * @typedef {Object} GroupedItems<T>
 * @property {string} key
 * @property {T[]} items
 * @property {number} size
 */
|
|
|
|
/**
 * @template T
 * @typedef {Object} Options
 * @property {number} maxSize maximum size of a group
 * @property {number} minSize minimum size of a group (preferred over maximum size)
 * @property {Iterable<T>} items a list of items
 * @property {function(T): number} getSize function to get size of an item
 * @property {function(T): string} getKey function to get the key of an item
 */
|
|
|
|
/**
|
|
* @template T
|
|
* @param {Options<T>} options options object
|
|
* @returns {GroupedItems<T>[]} grouped items
|
|
*/
|
|
module.exports = ({ maxSize, minSize, items, getSize, getKey }) => {
|
|
/** @type {Group<T>[]} */
|
|
const result = [];
|
|
|
|
const nodes = Array.from(
|
|
items,
|
|
item => new Node(item, getKey(item), getSize(item))
|
|
);
|
|
|
|
/** @type {Node<T>[]} */
|
|
const initialNodes = [];
|
|
|
|
// lexically ordering of keys
|
|
nodes.sort((a, b) => {
|
|
if (a.key < b.key) return -1;
|
|
if (a.key > b.key) return 1;
|
|
return 0;
|
|
});
|
|
|
|
// return nodes bigger than maxSize directly as group
|
|
for (const node of nodes) {
|
|
if (node.size >= maxSize) {
|
|
result.push(new Group([node], []));
|
|
} else {
|
|
initialNodes.push(node);
|
|
}
|
|
}
|
|
|
|
if (initialNodes.length > 0) {
|
|
// calculate similarities between lexically adjacent nodes
|
|
/** @type {number[]} */
|
|
const similarities = [];
|
|
for (let i = 1; i < initialNodes.length; i++) {
|
|
const a = initialNodes[i - 1];
|
|
const b = initialNodes[i];
|
|
similarities.push(similarity(a.key, b.key));
|
|
}
|
|
|
|
const initialGroup = new Group(initialNodes, similarities);
|
|
|
|
if (initialGroup.size < minSize) {
|
|
// We hit an edgecase where the working set is already smaller than minSize
|
|
// We merge it with the smallest result node to keep minSize intact
|
|
if (result.length > 0) {
|
|
const smallestGroup = result.reduce((min, group) =>
|
|
min.size > group.size ? group : min
|
|
);
|
|
for (const node of initialGroup.nodes) smallestGroup.nodes.push(node);
|
|
smallestGroup.nodes.sort((a, b) => {
|
|
if (a.key < b.key) return -1;
|
|
if (a.key > b.key) return 1;
|
|
return 0;
|
|
});
|
|
} else {
|
|
// There are no other nodes
|
|
// We use all nodes and have to accept that it's smaller than minSize
|
|
result.push(initialGroup);
|
|
}
|
|
} else {
|
|
const queue = [initialGroup];
|
|
|
|
while (queue.length) {
|
|
const group = queue.pop();
|
|
// only groups bigger than maxSize need to be splitted
|
|
if (group.size < maxSize) {
|
|
result.push(group);
|
|
continue;
|
|
}
|
|
|
|
// find unsplittable area from left and right
|
|
// going minSize from left and right
|
|
// at least one node need to be included otherwise we get stuck
|
|
let left = 0;
|
|
let leftSize = 0;
|
|
while (leftSize <= minSize) {
|
|
leftSize += group.nodes[left].size;
|
|
left++;
|
|
}
|
|
let right = group.nodes.length - 1;
|
|
let rightSize = 0;
|
|
while (rightSize <= minSize) {
|
|
rightSize += group.nodes[right].size;
|
|
right--;
|
|
}
|
|
|
|
if (left - 1 > right) {
|
|
// can't split group while holding minSize
|
|
// because minSize is preferred of maxSize we return
|
|
// the group here even while it's too big
|
|
// To avoid this make sure maxSize > minSize * 3
|
|
result.push(group);
|
|
continue;
|
|
}
|
|
if (left <= right) {
|
|
// when there is a area between left and right
|
|
// we look for best split point
|
|
// we split at the minimum similarity
|
|
// here key space is separated the most
|
|
let best = left - 1;
|
|
let bestSimilarity = group.similarities[best];
|
|
for (let i = left; i <= right; i++) {
|
|
const similarity = group.similarities[i];
|
|
if (similarity < bestSimilarity) {
|
|
best = i;
|
|
bestSimilarity = similarity;
|
|
}
|
|
}
|
|
left = best + 1;
|
|
right = best;
|
|
}
|
|
|
|
// create two new groups for left and right area
|
|
// and queue them up
|
|
const rightNodes = [group.nodes[right + 1]];
|
|
/** @type {number[]} */
|
|
const rightSimilaries = [];
|
|
for (let i = right + 2; i < group.nodes.length; i++) {
|
|
rightSimilaries.push(group.similarities[i - 1]);
|
|
rightNodes.push(group.nodes[i]);
|
|
}
|
|
queue.push(new Group(rightNodes, rightSimilaries));
|
|
|
|
const leftNodes = [group.nodes[0]];
|
|
/** @type {number[]} */
|
|
const leftSimilaries = [];
|
|
for (let i = 1; i < left; i++) {
|
|
leftSimilaries.push(group.similarities[i - 1]);
|
|
leftNodes.push(group.nodes[i]);
|
|
}
|
|
queue.push(new Group(leftNodes, leftSimilaries));
|
|
}
|
|
}
|
|
}
|
|
|
|
// lexically ordering
|
|
result.sort((a, b) => {
|
|
if (a.nodes[0].key < b.nodes[0].key) return -1;
|
|
if (a.nodes[0].key > b.nodes[0].key) return 1;
|
|
return 0;
|
|
});
|
|
|
|
// give every group a name
|
|
for (let i = 0; i < result.length; i++) {
|
|
const group = result[i];
|
|
const first = group.nodes[0];
|
|
const last = group.nodes[group.nodes.length - 1];
|
|
let name = getName(first.key, last.key);
|
|
group.key = name;
|
|
}
|
|
|
|
// return the results
|
|
return result.map(group => {
|
|
/** @type {GroupedItems} */
|
|
return {
|
|
key: group.key,
|
|
items: group.nodes.map(node => node.item),
|
|
size: group.size
|
|
};
|
|
});
|
|
};
|