deno.land / x / simplestatistic@v7.7.1 / src / ckmeans.js

نووسراو ببینە
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
import makeMatrix from "./make_matrix";import numericSort from "./numeric_sort";import uniqueCountSorted from "./unique_count_sorted";
/** * Generates incrementally computed values based on the sums and sums of * squares for the data array * * @private * @param {number} j * @param {number} i * @param {Array<number>} sums * @param {Array<number>} sumsOfSquares * @return {number} * @example * ssq(0, 1, [-1, 0, 2], [1, 1, 5]); */function ssq(j, i, sums, sumsOfSquares) { let sji; // s(j, i) if (j > 0) { const muji = (sums[i] - sums[j - 1]) / (i - j + 1); // mu(j, i) sji = sumsOfSquares[i] - sumsOfSquares[j - 1] - (i - j + 1) * muji * muji; } else { sji = sumsOfSquares[i] - (sums[i] * sums[i]) / (i + 1); } if (sji < 0) { return 0; } return sji;}
/** * Function that recursively divides and conquers computations * for cluster j * * @private * @param {number} iMin Minimum index in cluster to be computed * @param {number} iMax Maximum index in cluster to be computed * @param {number} cluster Index of the cluster currently being computed * @param {Array<Array<number>>} matrix * @param {Array<Array<number>>} backtrackMatrix * @param {Array<number>} sums * @param {Array<number>} sumsOfSquares */function fillMatrixColumn( iMin, iMax, cluster, matrix, backtrackMatrix, sums, sumsOfSquares) { if (iMin > iMax) { return; }
// Start at midpoint between iMin and iMax const i = Math.floor((iMin + iMax) / 2);
matrix[cluster][i] = matrix[cluster - 1][i - 1]; backtrackMatrix[cluster][i] = i;
let jlow = cluster; // the lower end for j
if (iMin > cluster) { jlow = Math.max(jlow, backtrackMatrix[cluster][iMin - 1] || 0); } jlow = Math.max(jlow, backtrackMatrix[cluster - 1][i] || 0);
let jhigh = i - 1; // the upper end for j if (iMax < matrix[0].length - 1) { jhigh = Math.min(jhigh, backtrackMatrix[cluster][iMax + 1] || 0); }
let sji; let sjlowi; let ssqjlow; let ssqj; for (let j = jhigh; j >= jlow; --j) { sji = ssq(j, i, sums, sumsOfSquares);
if (sji + matrix[cluster - 1][jlow - 1] >= matrix[cluster][i]) { break; }
// Examine the lower bound of the cluster border sjlowi = ssq(jlow, i, sums, sumsOfSquares);
ssqjlow = sjlowi + matrix[cluster - 1][jlow - 1];
if (ssqjlow < matrix[cluster][i]) { // Shrink the lower bound matrix[cluster][i] = ssqjlow; backtrackMatrix[cluster][i] = jlow; } jlow++;
ssqj = sji + matrix[cluster - 1][j - 1]; if (ssqj < matrix[cluster][i]) { matrix[cluster][i] = ssqj; backtrackMatrix[cluster][i] = j; } }
fillMatrixColumn( iMin, i - 1, cluster, matrix, backtrackMatrix, sums, sumsOfSquares ); fillMatrixColumn( i + 1, iMax, cluster, matrix, backtrackMatrix, sums, sumsOfSquares );}
/** * Initializes the main matrices used in Ckmeans and kicks * off the divide and conquer cluster computation strategy * * @private * @param {Array<number>} data sorted array of values * @param {Array<Array<number>>} matrix * @param {Array<Array<number>>} backtrackMatrix */function fillMatrices(data, matrix, backtrackMatrix) { const nValues = matrix[0].length;
// Shift values by the median to improve numeric stability const shift = data[Math.floor(nValues / 2)];
// Cumulative sum and cumulative sum of squares for all values in data array const sums = []; const sumsOfSquares = [];
// Initialize first column in matrix & backtrackMatrix for (let i = 0, shiftedValue; i < nValues; ++i) { shiftedValue = data[i] - shift; if (i === 0) { sums.push(shiftedValue); sumsOfSquares.push(shiftedValue * shiftedValue); } else { sums.push(sums[i - 1] + shiftedValue); sumsOfSquares.push( sumsOfSquares[i - 1] + shiftedValue * shiftedValue ); }
// Initialize for cluster = 0 matrix[0][i] = ssq(0, i, sums, sumsOfSquares); backtrackMatrix[0][i] = 0; }
// Initialize the rest of the columns let iMin; for (let cluster = 1; cluster < matrix.length; ++cluster) { if (cluster < matrix.length - 1) { iMin = cluster; } else { // No need to compute matrix[K-1][0] ... matrix[K-1][N-2] iMin = nValues - 1; }
fillMatrixColumn( iMin, nValues - 1, cluster, matrix, backtrackMatrix, sums, sumsOfSquares ); }}
/** * Ckmeans clustering is an improvement on heuristic-based clustering * approaches like Jenks. The algorithm was developed in * [Haizhou Wang and Mingzhou Song](http://journal.r-project.org/archive/2011-2/RJournal_2011-2_Wang+Song.pdf) * as a [dynamic programming](https://en.wikipedia.org/wiki/Dynamic_programming) approach * to the problem of clustering numeric data into groups with the least * within-group sum-of-squared-deviations. * * Minimizing the difference within groups - what Wang & Song refer to as * `withinss`, or within sum-of-squares, means that groups are optimally * homogenous within and the data is split into representative groups. * This is very useful for visualization, where you may want to represent * a continuous variable in discrete color or style groups. This function * can provide groups that emphasize differences between data. * * Being a dynamic approach, this algorithm is based on two matrices that * store incrementally-computed values for squared deviations and backtracking * indexes. * * This implementation is based on Ckmeans 3.4.6, which introduced a new divide * and conquer approach that improved runtime from O(kn^2) to O(kn log(n)). * * Unlike the [original implementation](https://cran.r-project.org/web/packages/Ckmeans.1d.dp/index.html), * this implementation does not include any code to automatically determine * the optimal number of clusters: this information needs to be explicitly * provided. * * ### References * _Ckmeans.1d.dp: Optimal k-means Clustering in One Dimension by Dynamic * Programming_ Haizhou Wang and Mingzhou Song ISSN 2073-4859 * * from The R Journal Vol. 3/2, December 2011 * @param {Array<number>} x input data, as an array of number values * @param {number} nClusters number of desired classes. This cannot be * greater than the number of values in the data array. * @returns {Array<Array<number>>} clustered input * @throws {Error} if the number of requested clusters is higher than the size of the data * @example * ckmeans([-1, 2, -1, 2, 4, 5, 6, -1, 2, -1], 3); * // The input, clustered into groups of similar numbers. * //= [[-1, -1, -1, -1], [2, 2, 2], [4, 5, 6]]); */function ckmeans(x, nClusters) { if (nClusters > x.length) { throw new Error( "cannot generate more classes than there are data values" ); }
const sorted = numericSort(x); // we'll use this as the maximum number of clusters const uniqueCount = uniqueCountSorted(sorted);
// if all of the input values are identical, there's one cluster // with all of the input in it. if (uniqueCount === 1) { return [sorted]; }
// named 'S' originally const matrix = makeMatrix(nClusters, sorted.length); // named 'J' originally const backtrackMatrix = makeMatrix(nClusters, sorted.length);
// This is a dynamic programming way to solve the problem of minimizing // within-cluster sum of squares. It's similar to linear regression // in this way, and this calculation incrementally computes the // sum of squares that are later read. fillMatrices(sorted, matrix, backtrackMatrix);
// The real work of Ckmeans clustering happens in the matrix generation: // the generated matrices encode all possible clustering combinations, and // once they're generated we can solve for the best clustering groups // very quickly. const clusters = []; let clusterRight = backtrackMatrix[0].length - 1;
// Backtrack the clusters from the dynamic programming matrix. This // starts at the bottom-right corner of the matrix (if the top-left is 0, 0), // and moves the cluster target with the loop. for (let cluster = backtrackMatrix.length - 1; cluster >= 0; cluster--) { const clusterLeft = backtrackMatrix[cluster][clusterRight];
// fill the cluster from the sorted input by taking a slice of the // array. the backtrack matrix makes this easy - it stores the // indexes where the cluster should start and end. clusters[cluster] = sorted.slice(clusterLeft, clusterRight + 1);
if (cluster > 0) { clusterRight = clusterLeft - 1; } }
return clusters;}
export default ckmeans;
simplestatistic

Version Info

Tagged at
2 years ago