function [randomized_targets, permutation] = cosmo_randomize_targets(ds, varargin)
% provides randomized target labels
%
% randomized_targets=cosmo_randmize_targets(ds[,'seed',seed)
%
% Inputs:
% ds dataset struct with fields .sa.targets and
% .sa.chunks
% 'seed', seed (optional) if provided, use this seed value for
% pseudo-random number generation
%
%
% Outputs:
% randomized_targets P x 1 with randomized targets
% If ds defines a repeated-measures design (which
% requires that each chunk has the same set of
% unique targets), then targets are randomized
% separately for each chunk.
% Otherwise (when each chunk is associated with
% exactly one sample, i.e. all samples are
% independent), the targets are randomized
% without considering the chunk values.
% permutation P x 1 with indices of permutation. It holds that
% randomized_targets == ds.sa.targets(permutation).
%
% Example:
% % generate tiny dataset with 15 chunks, each with two targets
% ds=cosmo_synthetic_dataset('nchunks',15);
% % show number of samples with targets 1 or 2
% histc(ds.sa.targets',1:2)
% %|| [15 15]
% % generate randomized targets
% rand_targets=cosmo_randomize_targets(ds);
% % the number of samples with targets 1 or 2 is the same ...
% histc(rand_targets',1:2)
% %|| [15 15]
% % ... but the targets are re-ordered
% all(ds.sa.targets==rand_targets)
% %|| false
% %
% % when using the 'seed' option, the output is deterministic
% % (multiple calls to this function always give the same output)
% rand_targets_deterministic=cosmo_randomize_targets(ds,'seed',314);
% rand_targets_deterministic'
% %|| [ 2 1 1 2 2 1 2 1 2 1 2 1 1 2 2 1 1 2 2 1 1 2 2 1 2 1 2 1 2 1 ]
%
% # For CoSMoMVPA's copyright information and license terms, #
% # see the COPYING file distributed with CoSMoMVPA. #
opt = cosmo_structjoin(varargin);
[targets, chunks] = get_targets_and_chunks(ds);
nsamples = numel(targets);
[nunq_targets, unq_targets, target_idxs] = get_unique(targets);
[nunq_chunks, unq_chunks] = get_unique(chunks);
if nunq_chunks == nsamples
% between-subject design
rps = randperms_with_size(nsamples, opt);
permutation = rps{1};
else
% within-subject design
chunk_partition = cosmo_index_unique(chunks);
% count number of samples in each chunk, and ensure that no target
% is missing
nchunks = numel(chunk_partition);
samples_per_chunk = zeros(1, nchunks);
for k = 1:nchunks
sample_idxs = chunk_partition{k};
h = histc(target_idxs(sample_idxs), 1:nunq_targets);
if any(h == 0)
i = find(h == 0, 1);
error(['.sa.chunks and .sa.targets suggest a repeated '...
'measure design, but chunk %d has missing '...
'target %d'], ...
unq_chunks(k), unq_targets(i));
end
samples_per_chunk(k) = numel(sample_idxs);
end
% do permutation in each chunk separately
rps = randperms_with_size(samples_per_chunk, opt);
permutation = zeros(nsamples, 1);
for k = 1:nchunks
rp = rps{k};
sample_idxs = chunk_partition{k};
permutation(sample_idxs) = sample_idxs(rp);
end
end
randomized_targets = targets(permutation);
function rps = randperms_with_size(sizes, opt)
% helper function
% Input: sizes is 1xN vector
% Output: rps is 1xN cell; rps{k} is a random permutation of 1:sizes(k)
%
cum_size = sum(sizes);
% single call to cosmo_rand, because this call is computationally
% expensive
if isfield(opt, 'seed')
r = cosmo_rand(1, cum_size, 'seed', opt.seed);
else
r = cosmo_rand(1, cum_size);
end
n = numel(sizes);
rps = cell(1, n);
first_pos = 1;
for k = 1:n
last_pos = first_pos + sizes(k) - 1;
% get sizes(k) random values
r_part = r(first_pos:last_pos);
% get sorting indices to get random permutation of 1:sizes(k)
[unused, rps{k}] = sort(r_part);
% for next iteration
first_pos = last_pos + 1;
end
function [n, unq, idxs] = get_unique(xs)
[unq, unused, idxs] = unique(xs);
n = numel(unq);
function [targets, chunks] = get_targets_and_chunks(ds)
if ~isfield(ds, 'sa') || ...
~isfield(ds.sa, 'chunks') || ~isfield(ds.sa, 'targets')
error('dataset must have .sa.chunks and .sa.targets');
end
targets = ds.sa.targets;
chunks = ds.sa.chunks;