function partitions = cosmo_nchoosek_partitioner(chunks_or_ds, k, varargin)
% generates nchoosek(n,k) partitions with optional grouping schemes
%
% partitions=cosmo_nchoosek_partitioner(chunks, k, group_values1, test_group_by1,...)
%
% Input
%  - chunks          Px1 chunk indices for P samples. It can also be a
%                    dataset struct with field .sa.chunks
%  - k               When an integer, k chunks are in each test partition.
%                    When between 0 and 1, it is interpreted as
%                    round(k*nchunks), where nchunks is the number of
%                    unique values in chunks.
%                    A special case, mostly aimed at split-half
%                    correlations, is k=='half'; this sets k to .5 and,
%                    if nchunks is even, treats train_indices and
%                    test_indices as symmetrical, meaning that only half
%                    the number of partitions is returned (avoiding
%                    duplicates). If nchunks is odd, then the train and
%                    test indices contain (nchunks-1)/2 and (nchunks+1)/2
%                    chunks, respectively, or vice versa.
%  - group_values*   } Intended for cross-participant or cross-condition
%  - test_group_by*  } generalizability analyses.
%                      Each pair of these adds another level of grouping
%                      to the partition scheme. group_values must contain
%                      a group label for each sample (a Px1 vector or
%                      cell), or, if the first argument is a dataset
%                      struct, it can be the name of a sample attribute
%                      whose values are used as labels.
%                      test_group_by indicates which values in
%                      group_values are used as test values in the
%                      cross-validation scheme (all other values are used
%                      as training values); it can also be a cell with
%                      multiple elements, each yielding its own set of
%                      folds. If empty, it is set to the unique values in
%                      group_values.
%
% Output:
%  - partitions      A struct with fields:
%     .train_indices } Each of these is a 1xQ cell for Q partitions, where
%     .test_indices  } Q=nchoosek(nchunks,k). Together they contain all
%                      possible combinations of k test chunks and
%                      (nchunks-k) training chunks (except when k=='half',
%                      see above).
%                    If group_values* and test_group_by* are specified,
%                    then the number of output partitions is multiplied
%                    by the product of the number of values in
%                    test_group_by*.
%
%
% Examples:
%     % make a simple dataset with 4 chunks, 2 samples each
%     % assume two targets (i.e. conditions, say piano versus guitar)
%     ds=struct();
%     ds.samples=randn(8,99); % 8 samples, 99 features
%     ds.sa.targets=[1 1 1 1 2 2 2 2]';
%     ds.sa.chunks=2+[1 2 3 4 4 3 2 1]';
%     cosmo_check_dataset(ds); % sanity check
%     %
%     % take-one-chunk out partitioning
%     p=cosmo_nchoosek_partitioner(ds,1);
%     cosmo_disp(p);
%     %|| .train_indices
%     %||   { [ 2    [ 1    [ 1    [ 1
%     %||       3      3      2      2
%     %||       4      4      4      3
%     %||       5      5      5      6
%     %||       6      6      7      7
%     %||       7 ]    8 ]    8 ]    8 ] }
%     %|| .test_indices
%     %||   { [ 1    [ 2    [ 3    [ 4
%     %||       8 ]    7 ]    6 ]    5 ] }
%     %
%     % take-two chunks out partitioning
%     p=cosmo_nchoosek_partitioner(ds,2);
%     cosmo_disp(p);
%     %|| .train_indices
%     %||   { [ 3    [ 2    [ 2    [ 1    [ 1    [ 1
%     %||       4      4      3      4      3      2
%     %||       5      5      6      5      6      7
%     %||       6 ]    7 ]    7 ]    8 ]    8 ]    8 ] }
%     %|| .test_indices
%     %||   { [ 1    [ 1    [ 1    [ 2    [ 2    [ 3
%     %||       2      3      4      3      4      4
%     %||       7      6      5      6      5      5
%     %||       8 ]    8 ]    8 ]    7 ]    7 ]    6 ] }
%     %
%     % take-half-of-the-chunks out partitioning
%     % (this effectively gives same chunks as above)
%     p_alt=cosmo_nchoosek_partitioner(ds,.5);
%     isequal(p, p_alt)
%     %|| true
%     %
%     % do half split (for correlation measure); this leaves out
%     % mirror partitions of train and test indices
%     p=cosmo_nchoosek_partitioner(ds,'half');
%     cosmo_disp(p);
%     %|| .train_indices
%     %||   { [ 1    [ 1    [ 1
%     %||       2      3      4
%     %||       7      6      5
%     %||       8 ]    8 ]    8 ] }
%     %|| .test_indices
%     %||   { [ 3    [ 2    [ 2
%     %||       4      4      3
%     %||       5      5      6
%     %||       6 ]    7 ]    7 ] }
%     %
%     % test on samples with chunk=3 only using take-one-fold out
%     p=cosmo_nchoosek_partitioner(ds,1,'chunks',3);
%     cosmo_disp(p);
%     %|| .train_indices
%     %||   { [ 2
%     %||       3
%     %||       4
%     %||       5
%     %||       6
%     %||       7 ] }
%     %|| .test_indices
%     %||   { [ 1
%     %||       8 ] }
%     % test on samples with chunk=[3 4] only using take-one-fold out;
%     % only samples with chunks=1 or 2 are used for training
%     p=cosmo_nchoosek_partitioner(ds,1,'chunks',{[3 4]});
%     cosmo_disp(p);
%     %|| .train_indices
%     %||   { [ 3    [ 3
%     %||       4      4
%     %||       5      5
%     %||       6 ]    6 ] }
%     %|| .test_indices
%     %||   { [ 1    [ 2
%     %||       8 ]    7 ] }
%     % test separately on samples with chunk=3 and samples with chunk=4;
%     % in some folds, samples with chunks=1,2,4 are used for training, in
%     % other folds samples with chunks=1,2,3 are used for training
%     p=cosmo_nchoosek_partitioner(ds,1,'chunks',{[3],[4]});
%     cosmo_disp(p);
%     %|| .train_indices
%     %||   { [ 2    [ 1
%     %||       3      3
%     %||       4      4
%     %||       5      5
%     %||       6      6
%     %||       7 ]    8 ] }
%     %|| .test_indices
%     %||   { [ 1    [ 2
%     %||       8 ]    7 ] }
%     %||
%
%
%     % make a slightly more complicated dataset: with three chunks,
%     % suppose there are two modalities (e.g. (1) visual and (2)
%     % auditory stimulation) which are stored in an
%     % additional field 'modality'
%     ds=struct();
%     ds.samples=randn(12,99);
%     ds.sa.chunks  =[1 1 1 1 2 2 2 2 3 3 3 3]';
%     ds.sa.targets =[1 2 1 2 1 2 1 2 1 2 1 2]';
%     ds.sa.modality=[1 1 2 2 1 1 2 2 1 1 2 2]';
%     cosmo_check_dataset(ds);
%     %
%     % take-one-chunk out, test on samples with modality=1 (and train
%     % on samples with other modalities, i.e. modality=2)
%     p=cosmo_nchoosek_partitioner(ds,1,'modality',1);
%     cosmo_disp(p);
%     %|| .train_indices
%     %||   { [  7    [  3    [ 3
%     %||        8       4      4
%     %||       11      11      7
%     %||       12 ]    12 ]    8 ] }
%     %|| .test_indices
%     %||   { [ 1    [ 5    [  9
%     %||       2 ]    6 ]    10 ] }
%     %
%     % take-one-chunk out, test on samples with modality=2
%     p=cosmo_nchoosek_partitioner(ds,1,'modality',2);
%     cosmo_disp(p);
%     %|| .train_indices
%     %||   { [  5    [  1    [ 1
%     %||        6       2      2
%     %||        9       9      5
%     %||       10 ]    10 ]    6 ] }
%     %|| .test_indices
%     %||   { [ 3    [ 7    [ 11
%     %||       4 ]    8 ]    12 ] }
%     % take-one-chunk out, test on samples with modality=1 (and train on
%     % modality=2), and vice versa
%     p=cosmo_nchoosek_partitioner(ds,1,'modality',[]);
%     cosmo_disp(p);
%     %|| .train_indices
%     %||   { [  7    [  3    [ 3    [  5    [  1    [ 1
%     %||        8       4      4       6       2      2
%     %||       11      11      7       9       9      5
%     %||       12 ]    12 ]    8 ]    10 ]    10 ]    6 ] }
%     %|| .test_indices
%     %||   { [ 1    [ 5    [  9    [ 3    [ 7    [ 11
%     %||       2 ]    6 ]    10 ]    4 ]    8 ]    12 ] }
%
%     % between-subject classification: 3 chunks, 2 modalities, 5 subjects
%     ds=struct();
%     ds.samples=randn(60,99);
%     ds.sa.targets=repmat([1 2],1,30)';
%     ds.sa.chunks=repmat([1 1 1 1 2 2 2 2 3 3 3 3],1,5)';
%     ds.sa.modality=repmat([1 1 2 2],1,15)';
%     ds.sa.subject=kron(1:5,ones(1,12))';
%     cosmo_check_dataset(ds);
%     %
%     % test on subject 1, train on other subjects using take-one-chunk out
%     p=cosmo_nchoosek_partitioner(ds,1,'subject',1);
%     cosmo_disp(p);
%     %|| .train_indices
%     %||   { [ 17         [ 13         [ 13
%     %||       18           14           14
%     %||       19           15           15
%     %||        :            :            :
%     %||       58           58           54
%     %||       59           59           55
%     %||       60 ]@32x1    60 ]@32x1    56 ]@32x1 }
%     %|| .test_indices
%     %||   { [ 1    [ 5    [  9
%     %||       2      6      10
%     %||       3      7      11
%     %||       4 ]    8 ]    12 ] }
%     %
%     % test on each subject after training on all other subjects;
%     % in each fold, the test data is from one subject and one chunk,
%     % and the train data from all other subjects and all other chunks.
%     % since there are 5 subjects and 3 chunks, there are 15 folds.
%     p=cosmo_nchoosek_partitioner(ds,1,'subject',[]);
%     cosmo_disp(p);
%     %|| .train_indices
%     %||   { [ 17         [ 13         [ 13        ... [  5         [  1         [  1
%     %||       18           14           14               6            2            2
%     %||       19           15           15               7            3            3
%     %||        :            :            :               :            :            :
%     %||       58           58           54              46           46           42
%     %||       59           59           55              47           47           43
%     %||       60 ]@32x1    60 ]@32x1    56 ]@32x1       48 ]@32x1    48 ]@32x1    44 ]@32x1   }@1x15
%     %|| .test_indices
%     %||   { [ 1    [ 5    [  9   ... [ 49    [ 53    [ 57
%     %||       2      6      10         50      54      58
%     %||       3      7      11         51      55      59
%     %||       4 ]    8 ]    12 ]       52 ]    56 ]    60 ]   }@1x15
%     %
%     % as above, but test on modality=2 (and train on other values for
%     % modality, i.e. modality=1)
%     p=cosmo_nchoosek_partitioner(ds,1,'subject',[],'modality',2);
%     cosmo_disp(p);
%     %|| .train_indices
%     %||   { [ 17         [ 13         [ 13        ... [  5         [  1         [  1
%     %||       18           14           14               6            2            2
%     %||       21           21           17               9            9            5
%     %||        :            :            :               :            :            :
%     %||       54           50           50              42           38           38
%     %||       57           57           53              45           45           41
%     %||       58 ]@16x1    58 ]@16x1    54 ]@16x1       46 ]@16x1    46 ]@16x1    42 ]@16x1   }@1x15
%     %|| .test_indices
%     %||   { [ 3    [ 7    [ 11   ... [ 51    [ 55    [ 59
%     %||       4 ]    8 ]    12 ]       52 ]    56 ]    60 ]   }@1x15
%     %
%     % as above, but test on each modality after training on the other
%     % modality. There are 30 folds (5 subjects, 3 chunks, 2 modalities).
%     p=cosmo_nchoosek_partitioner(ds,1,'subject',[],'modality',[]);
%     cosmo_disp(p);
%     %|| .train_indices
%     %||   { [ 19         [ 15         [ 15        ... [  5         [  1         [  1
%     %||       20           16           16               6            2            2
%     %||       23           23           19               9            9            5
%     %||        :            :            :               :            :            :
%     %||       56           52           52              42           38           38
%     %||       59           59           55              45           45           41
%     %||       60 ]@16x1    60 ]@16x1    56 ]@16x1       46 ]@16x1    46 ]@16x1    42 ]@16x1   }@1x30
%     %|| .test_indices
%     %||   { [ 1    [ 5    [  9   ... [ 51    [ 55    [ 59
%     %||       2 ]    6 ]    10 ]       52 ]    56 ]    60 ]   }@1x30
%
% Notes:
%    - When called with k=1 and only two input arguments, this function
%      behaves equivalently to cosmo_nfold_partitioner. Thus, in the
%      simplest case (n-fold partitioning), either function can be used.
%    - This function does not consider any .sa.targets (trial condition)
%      or .samples information.
%    - As shown in the examples above, this function can be used for
%      cross-modal and/or cross-participant cross-validation.
%    - For cross-validation it is recommended to balance partitions using
%      cosmo_balance_partitions (see the sketch at the end of these
%      notes).
%    - This function can be used for cross-decoding analyses. Doing so may
%      require a re-assignment of .sa.targets and the addition of another
%      sample attribute to specify which samples are used for training
%      and testing.
%     For example, consider the following dataset with six unique
%     conditions as specified in the sample attribute field .sa:
%
%       .targets    .chunks     .labels
%       1           1           'vis_dog'
%       2           1           'vis_cat'
%       3           1           'vis_frog'
%       4           1           'aud_dog'
%       5           1           'aud_cat'
%       6           1           'aud_frog'
%       1           2           'vis_dog'
%       2           2           'vis_cat'
%       :           :               :
%       6           8           'aud_frog'
%
%    This dataset has 8 chunks, each with 6 conditions: three for visual
%    stimuli of dogs, cats, and frogs, and three for auditory stimuli for
%    the same animals. The field .labels is not required, but used for a
%    human-readable description of the condition of each sample.
%
%    Suppose that one wants to do cross-decoding to see
%    if discrimination of animals generalizes between the visual and
%    auditory modalities.
%    To do so, the user has to:
%       * change the .targets field to indicate the animal species,
%       * add another field (here 'modality') indicating which samples
%         are used for training and which for testing
%
%    In this example, the sample attribute field .sa can be set as follows:
%
%       .targets    .chunks     .labels     .modality
%       1           1           'vis_dog'   1
%       2           1           'vis_cat'   1
%       3           1           'vis_frog'  1
%       1           1           'aud_dog'   2
%       2           1           'aud_cat'   2
%       3           1           'aud_frog'  2
%       1           2           'vis_dog'   1
%       2           2           'vis_cat'   1
%       :           :               :
%       6           8           'aud_frog'  2
%
%    so that:
%     * .modality corresponds to the visual (=1) and auditory (=2)
%       modality.
%     * .targets corresponds to the dog (=1), cat (=2), and frog (=3)
%       species.
%
%    With this re-assignment of .targets, testing on the auditory modality
%    and training on the visual modality with take-one-chunk out
%    cross-validation can be done using:
%
%       test_modality=2; % train on all other modalities (here: only 1)
%       partitions=cosmo_nchoosek_partitioner(ds,1,...
%                                   'modality',test_modality);
%
%    If test_modality is set to empty ([]), then both modalities are used
%    for training and for testing (in separate folds).
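%
%    Before using these partitions for classification-based
%    cross-validation, it is recommended (as noted above) to balance
%    them. A minimal sketch, assuming the dataset struct 'ds' from the
%    example above (the exact balancing options depend on the analysis):
%
%       test_modality=2;
%       partitions=cosmo_nchoosek_partitioner(ds,1,...
%                                   'modality',test_modality);
%       partitions=cosmo_balance_partitions(partitions,ds);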
%
% See also: cosmo_nfold_partitioner, cosmo_balance_partitions
%
% #   For CoSMoMVPA's copyright information and license terms,   #
% #   see the COPYING file distributed with CoSMoMVPA.           #

    % if the first input is a dataset, get the chunks from it
    is_ds=isstruct(chunks_or_ds);
    if is_ds
        if isfield(chunks_or_ds,'sa') && isfield(chunks_or_ds.sa,'chunks')
            % it is a dataset
            ds=chunks_or_ds;
            chunks=ds.sa.chunks;
        else
            error('illegal struct input');
        end
    else
        chunks=chunks_or_ds;
    end

    % use helper function defined below
    partitions=nchoosek_partitioner(chunks,k);

    if nargin<3
        return;
    elseif mod(nargin,2)~=0
        error('Need even number of arguments');
    else
        nsamples=numel(chunks);
        for j=1:2:(nargin-2)
            % loop over pairs of (group_values, test_group_by) arguments
            group_values=varargin{j};
            test_group_by=varargin{j+1};

            % get sample attribute, if input was dataset
            if ischar(group_values)
                if is_ds
                    group_values=ds.sa.(group_values);
                else
                    error('String for group value requires dataset input');
                end
            end

            % check that the number of group values matches the number
            % of samples
            ngroup_by=numel(group_values);
            if ngroup_by ~= nsamples
                error('group_values has %d values, expected %d', ...
                                ngroup_by, nsamples);
            end

            if isempty(test_group_by)
                test_group_by=num2cell(unique(group_values));
            elseif isnumeric(test_group_by)
                test_group_by={test_group_by};
            end
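
            % at this point test_group_by is a cell array; each element
            % specifies the group value(s) to test on in one set of
            % output folds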

            % update partitions using helper function below
            partitions=group_by(partitions, group_values, test_group_by);
        end
    end


function partitions=group_by(partitions, group_values, test_group_by)
    % helper function to split partitions by group
    % the output has (at most) N times as many partitions as the input,
    % where N=numel(test_group_by); folds in which the training or test
    % set would be empty are dropped.
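    %
    % for example, with test_group_by={1,2}, each input fold yields up to
    % two output folds: one testing on the samples whose group value is 1
    % (and training on the rest), and one testing on group value 2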

    npartitions=numel(partitions.test_indices);

    assert(iscell(test_group_by));
    % see which values to split on
    ntest_group_by=numel(test_group_by);

    % allocate space for output
    train_indices=cell(1,ntest_group_by);
    test_indices=cell(1,ntest_group_by);

    % run for each unique value in group_by_values
    for m=1:ntest_group_by
        test_by=test_group_by{m};

        % allocate space for this iteration
        train_indices_cell=cell(1,npartitions);
        test_indices_cell=cell(1,npartitions);

        % some filtered partitions may be empty, so keep track of the
        % last position where a value was stored
        pos=0;
        for j=1:npartitions
            % get testing chunk indices
            p_test=partitions.test_indices{j};

            % see which ones match the group_by_value
            msk_test=cosmo_match(group_values(p_test),test_by);

            % keep just those indices
            p_test_masked=p_test(msk_test);

            % the same for training, but different from test_by
            p_train=partitions.train_indices{j};
            msk_train=~cosmo_match(group_values(p_train),test_by);
            p_train_masked=p_train(msk_train);

            if ~isempty(p_test_masked) && ~isempty(p_train_masked)
                % both training and test set are non-empty, so keep result
                pos=pos+1;
                test_indices_cell{pos}=p_test_masked;
                train_indices_cell{pos}=p_train_masked;
            end
        end

        % store the train and test indices for the m-th test_group_by value
        test_indices{m}=test_indices_cell(1:pos);
        train_indices{m}=train_indices_cell(1:pos);
    end

    partitions=struct();

    % concatenate results for training and test indices
    partitions.train_indices=cat(2,train_indices{:});
    partitions.test_indices=cat(2,test_indices{:});





function partitions=nchoosek_partitioner(chunks,k)
% basic partitioner: uses every nchoosek(nchunks,k) combination of chunks
% as a test set, with the remaining chunks forming the training set

[unq,unused,chunk_indices]=unique(chunks);
nchunks=numel(unq);

if nchunks<2
    error(['at least two unique values for .sa.chunks are required, '...
                    'found %d.'], nchunks);
end

% deal with special 'half' case
is_symmetric=false;
if ischar(k)
    if strcmp(k,'half')
        k=.5;
        is_symmetric=true;
    else
        error('illegal string k');
    end
end
if isnumeric(k)
    if ~isscalar(k)
        error('k must be a scalar');
    end

    if k ~= round(k)
        k=round(nchunks*k);
    end

    if ~any(k==1:(nchunks-1))
        error('illegal k=%d: the number of test chunks must be between 1 and %d', ...
                    k, nchunks-1);
    end
else
    error('illegal parameter for k');
end
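
% for example, with 8 chunks, k='half' becomes k=4 and, because train and
% test sets are then complementary, only 35 of the nchoosek(8,4)=70
% combinations are returned (see the symmetric case below)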

% little optimization: if the chunk values are exactly 1 and 2, the split
% is trivial
if all(cosmo_match(chunks,[1 2]))
    chunk_msk1=chunks==1;
    chunk1_idxs=find(chunk_msk1);
    chunk2_idxs=find(~chunk_msk1);

    partitions=struct();
    if is_symmetric
        partitions.train_indices={chunk1_idxs};
        partitions.test_indices={chunk2_idxs};
    else
        partitions.train_indices={chunk1_idxs,chunk2_idxs};
        partitions.test_indices={chunk2_idxs,chunk1_idxs};
    end
    return
end

npartitions=nchoosek(nchunks,k);
combis=nchoosek(1:nchunks,k); % make all combinations

if is_symmetric && mod(nchunks,2)==0
    % when nchunks is even, return only half of the combinations:
    % the current implementation of nchoosek is such that combis(j,:) and
    % combis(npartitions+1-j,:) are complementary (i.e. together they make
    % up 1:nchunks). In principle this could change in the future, leading
    % to wrong results (if Mathworks, in its infinite wisdom, decides to
    % change the implementation of nchoosek), so to be sure we check that
    % the output matches what is expected.
    % The rationale is that this reduces the computation time of
    % subsequent analyses by half, in particular for the commonly used
    % split-half correlations.
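    %
    % (for example, with nchunks=4 and k=2, nchoosek(1:4,2) gives the six
    % rows [1 2; 1 3; 1 4; 2 3; 2 4; 3 4]; only [3 4], [2 4] and [2 3] are
    % kept, since their complements [1 2], [1 3] and [1 4] add no new
    % information)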
    nhalf=npartitions/2;

    check_combis=[combis(1:nhalf,:) combis(npartitions:-1:(nhalf+1),:)];

    % each row, when sorted, should be 1:nchunks
    matches=bsxfun(@eq,sort(check_combis,2),1:nchunks);
    assert(all(matches(:)),'nchoosek behaves weirdly');

    % we're good - just take the second half and update npartitions
    combis=combis(npartitions:-1:(nhalf+1),:);
    npartitions=nhalf;
end

% allocate space for output
train_indices=cell(1,npartitions);
test_indices=cell(1,npartitions);

% make all partitions
for j=1:npartitions
    combi=combis(j,:);
    test_msk=cosmo_match(chunk_indices,combi); % samples in the test chunks
    test_indices{j}=find(test_msk);
    train_indices{j}=find(~test_msk);
end

partitions=struct();
partitions.train_indices=train_indices;
partitions.test_indices=test_indices;