function test_suite = test_chunkize
% tests for cosmo_chunkize
%
% Entry point of this test file: it attempts to collect handles to the
% local (sub)functions defined below and then runs initTestSuite.
%
% # For CoSMoMVPA's copyright information and license terms, #
% # see the COPYING file distributed with CoSMoMVPA. #
try % assignment of 'localfunctions' is necessary in Matlab >= 2016
test_functions = localfunctions();
catch % no problem; early Matlab versions can use initTestSuite fine
end
% NOTE(review): test_functions is not referenced again in this function;
% presumably initTestSuite reads it from this workspace and is what
% assigns the test_suite output - confirm against the test framework.
initTestSuite;
function test_chunkize_basis
    % Basic behaviour of cosmo_chunkize on a shuffled timelock dataset
    % with 8 distinct chunk values, 6 samples each (48 samples in total):
    % - requesting 8 output chunks must return the input chunks unchanged
    % - requesting fewer chunks must never split an input chunk
    % - requesting more chunks than available must raise an exception
    % - a dataset without .sa.chunks must raise an exception
    ds = cosmo_synthetic_dataset('type', 'timelock', 'nreps', 8);
    ds.sa.chunks = reshape(repmat(1:8, 6, 1), [], 1);
    ds = cosmo_slice(ds, randperm(48));

    chunks = cosmo_chunkize(ds, 8);
    assertEqual(chunks, ds.sa.chunks);

    for j = 1:2:7
        chunks = cosmo_chunkize(ds, j);

        % pairwise "same chunk" matrices for output and input chunks
        eq_chunks = bsxfun(@eq, chunks, chunks');
        eq_ds = bsxfun(@eq, ds.sa.chunks, ds.sa.chunks');

        % any pair sharing an input chunk must also share an output chunk
        m = eq_ds & ~eq_chunks;
        assert(~any(m(:)));
    end

    % more output chunks than unique input chunks is not possible
    assertExceptionThrown(@()cosmo_chunkize(ds, 9), '');

    % Remove only the chunks attribute, keeping the rest of the dataset
    % intact, so that the exception is attributable to the missing
    % .sa.chunks field. (Assigning the result of rmfield to ds itself
    % would replace the whole dataset by its .sa struct.)
    ds.sa = rmfield(ds.sa, 'chunks');
    assertExceptionThrown(@()cosmo_chunkize(ds, 2), '');
function test_chunkize_imbalance()
    % Builds a tiny five-sample dataset step by step. cosmo_chunkize must
    % raise an exception while .sa.chunks or .sa.targets is still absent,
    % and when three output chunks are requested from two input chunks.
    % With two output chunks the result must pass assert_chunkize_ok,
    % both for the dataset itself and for two stacked copies of it.
    ds = struct();
    ds.samples = (1:5)';

    % no sample attributes at all
    assertExceptionThrown(@()cosmo_chunkize(ds, 2), '');

    % chunks present, targets still absent
    ds.sa.chunks = [3 3 4 4 4]';
    assertExceptionThrown(@()cosmo_chunkize(ds, 2), '');

    % both present, but only two unique input chunks are available
    ds.sa.targets = [11 12 11 12 12]';
    assertExceptionThrown(@()cosmo_chunkize(ds, 3), '');

    n_out = 2;

    single_chunks = cosmo_chunkize(ds, n_out);
    assert_chunkize_ok(ds, single_chunks, n_out);

    stacked_ds = cosmo_stack({ds, ds});
    stacked_chunks = cosmo_chunkize(stacked_ds, n_out);
    assert_chunkize_ok(stacked_ds, stacked_chunks, n_out);
function test_all_unique_chunks_tiny()
    % Five samples, each with its own chunk value, and two target values
    % (occurring two and three times). Every output chunk count from 2 up
    % to 5 must pass assert_chunkize_ok; asking for 6 chunks must raise
    % an exception.
    tiny_ds = struct();
    tiny_ds.samples = (1:5)';
    tiny_ds.sa.targets = [3 3 4 4 4]';
    tiny_ds.sa.chunks = (11:15)';

    n_max = 5;
    for n_out = 2:n_max
        out_chunks = cosmo_chunkize(tiny_ds, n_out);
        assert_chunkize_ok(tiny_ds, out_chunks, n_out);
    end

    assertExceptionThrown(@()cosmo_chunkize(tiny_ds, 6), '');
function test_chunkize_very_unbalanced_chunks_big()
    % Chunk values 1..nsamples/10 are each assigned to ten samples; the
    % targets of five randomly picked samples are then permuted (reversed
    % order among the picked samples). Random picks are redrawn until the
    % target-by-chunk design satisfies the acceptance condition below.
    % The chunkized result, for a random number of output chunks between
    % 4 and 7, must pass assert_chunkize_ok.
    ds = cosmo_synthetic_dataset('nreps', 6, 'ntargets', 5);
    nsamples = size(ds.samples, 1);
    ds.sa.chunks(:) = repmat(1:nsamples / 10, 1, 10);

    % number of distinct (target, chunk) pairs if all of them occur
    n_combis = max(ds.sa.chunks) * max(ds.sa.targets);

    orig_targets = ds.sa.targets;
    n_swap = 5;

    accepted = false;
    while ~accepted
        rp = randperm(nsamples);
        picked = rp(1:n_swap);

        % start from the pristine targets on every attempt
        ds.sa.targets = orig_targets;
        ds.sa.targets(picked) = orig_targets(picked(end:-1:1));

        combi_idxs = cosmo_index_unique({ds.sa.targets, ds.sa.chunks});
        n = cellfun(@numel, combi_idxs);

        % not too unbalanced
        accepted = min(n) >= 1 && max(n) <= 3 && ...
                    std(n) < .1 && numel(n) == n_combis;
    end

    nchunks = ceil(3 + rand() * 4);
    res = cosmo_chunkize(ds, nchunks);
    assert_chunkize_ok(ds, res, nchunks);
function test_chunkize_slight_unbalanced_chunks_big()
    % Chunk values 1..nsamples/2 are each assigned to two samples, and
    % the first five targets are overwritten by targets 2..6 to introduce
    % a slight imbalance. The chunkized result, for a random number of
    % output chunks obtained from ceil(rand() * 5), must pass
    % assert_chunkize_ok.
    ds = cosmo_synthetic_dataset('nreps', 6, 'ntargets', 5);
    nsamples = size(ds.samples, 1);

    n_unique_chunks = nsamples / 2;
    ds.sa.chunks(:) = repmat(1:n_unique_chunks, 1, 2);

    % slight imbalance: shift targets 2..6 one position towards the start
    shifted_targets = ds.sa.targets(2:6);
    ds.sa.targets(1:5) = shifted_targets;

    n_out = ceil(rand() * 5);
    out_chunks = cosmo_chunkize(ds, n_out);
    assert_chunkize_ok(ds, out_chunks, n_out);
function test_chunkize_all_unique_independent_chunks()
    % Every sample gets its own chunk value (consecutive integers with a
    % random offset) and the targets are randomly permuted. For each
    % candidate output chunk count, the result must pass
    % assert_chunkize_ok and every (target, output chunk) combination
    % must contain the same number of samples.
    ds = cosmo_synthetic_dataset('ntargets', 2, 'nchunks', 6 * 6);
    nsamples = size(ds.samples, 1);

    chunk_offset = ceil(rand() * 10);
    ds.sa.chunks(:) = chunk_offset + (1:nsamples);

    shuffle = randperm(nsamples);
    ds.sa.targets = ds.sa.targets(shuffle);

    candidates = [1 2 3 4 6 12 18];
    for k = 1:numel(candidates)
        n_out = candidates(k);

        out_chunks = cosmo_chunkize(ds, n_out);
        assert_chunkize_ok(ds, out_chunks, n_out);

        combi_idxs = cosmo_index_unique([ds.sa.targets out_chunks]);
        combi_count = cellfun(@numel, combi_idxs);

        % require full balance
        assert(all(combi_count == combi_count(1)));
    end
function test_chunkize_dependent_balanced_chunks()
    % Shuffled synthetic dataset with 36 input chunks, a random number of
    % targets and a random number of repeats, in which every
    % (chunk, target) combination occurs exactly nreps times. For each
    % candidate output chunk count, the result must pass
    % assert_chunkize_ok and every (target, output chunk) combination
    % must contain the same number of samples.
    ntargets = ceil(2 + rand() * 4);
    nreps = ceil(2 + rand() * 4);
    nchunks_in = 36;

    ds = cosmo_synthetic_dataset('ntargets', ntargets, ...
                                 'nchunks', nchunks_in, ...
                                 'nreps', nreps);
    nsamples = size(ds.samples, 1);
    ds = cosmo_slice(ds, randperm(nsamples));

    % sanity check on the input design
    rep_idxs = cosmo_index_unique({ds.sa.chunks, ds.sa.targets});
    rep_count = cellfun(@numel, rep_idxs);
    assert(all(rep_count == nreps));

    candidates = [1 2 3 4 6 12 18];
    for k = 1:numel(candidates)
        nchunks_out = candidates(k);

        out_chunks = cosmo_chunkize(ds, nchunks_out);
        assert_chunkize_ok(ds, out_chunks, nchunks_out);

        combi_idxs = cosmo_index_unique([ds.sa.targets out_chunks]);
        combi_count = cellfun(@numel, combi_idxs);

        % require full balance
        assert(all(combi_count == combi_count(1)));
    end
function assert_chunkize_ok(src_ds, chunks, count)
    % Composite check of a chunkize result.
    %
    % Inputs:
    %   src_ds    dataset struct with .sa.chunks and .sa.targets
    %   chunks    output chunk values, one per sample in src_ds
    %   count     expected number of unique values in chunks
    %
    % Raises an assertion failure if any of the checks below fails.

    % one output chunk value per input sample
    n_src = numel(src_ds.sa.chunks);
    n_out = numel(chunks);
    assertEqual(n_src, n_out);

    % targets must be spread roughly evenly over the output chunks
    assert_chunks_targets_balanced(src_ds, chunks);

    % no double dipping allowed
    assert_no_double_dipping(src_ds, chunks);

    % exactly the requested number of unique output chunks
    n_unique_out = numel(unique(chunks));
    assertEqual(n_unique_out, count);

    assert_chunks_targets_nonzero(src_ds, chunks);
function assert_chunks_targets_balanced(src_ds, chunks)
    % Asserts that the number of samples per occurring (target, chunk)
    % combination does not vary too much: the standard deviation of the
    % counts is at most 1.5, and the largest count exceeds the smallest
    % by at most 2.
    %
    % Testing for 'optimal' balance is not feasible due to combinatorial
    % explosion; these bounds are a decent way to ensure that chunks are
    % not too imbalanced.
    combi_idxs = cosmo_index_unique([src_ds.sa.targets chunks]);
    combi_count = cellfun(@numel, combi_idxs);

    assert(std(combi_count) <= 1.5);

    smallest = min(combi_count);
    largest = max(combi_count);
    assert(smallest + 2 >= largest);
function assert_chunks_targets_nonzero(src_ds, chunks)
    % Tabulates how often each (target, chunk) pair occurs, then asserts
    % that every chunk column and every target row of that table has at
    % least one nonzero entry.

    % third output of unique maps each sample to the index of its value
    [unused, unused, target_idx] = unique(src_ds.sa.targets);
    [unused, unused, chunk_idx] = unique(chunks);

    n_targets = max(target_idx);
    n_chunks = max(chunk_idx);
    n_samples = numel(chunks);

    % histogram with targets along rows and chunks along columns
    hist_tc = zeros(n_targets, n_chunks);
    for sample = 1:n_samples
        row = target_idx(sample);
        col = chunk_idx(sample);
        hist_tc(row, col) = hist_tc(row, col) + 1;
    end

    max_per_chunk = max(hist_tc, [], 1);
    assert(all(max_per_chunk > 0));

    max_per_target = max(hist_tc, [], 2);
    assert(all(max_per_target > 0));
function assert_no_double_dipping(src_ds, chunks)
    % Asserts that every chunk in src_ds ends up in exactly one output
    % chunk, i.e. samples that share a chunk value in src_ds.sa.chunks
    % are never spread over more than one value in chunks.
    [src_unq, unused, src_idx] = unique(src_ds.sa.chunks);
    [trg_unq, unused, trg_idx] = unique(chunks);

    n_samples = numel(src_ds.sa.chunks);

    % co-occurrence table: rows are source chunks, columns output chunks
    cooccur = zeros(numel(src_unq), numel(trg_unq));
    for sample = 1:n_samples
        row = src_idx(sample);
        col = trg_idx(sample);
        cooccur(row, col) = cooccur(row, col) + 1;
    end

    % each source chunk must map to a single output chunk
    n_trg_per_src = sum(cooccur > 0, 2);
    assert(all(n_trg_per_src == 1));