
function partitions = cosmo_nchoosek_partitioner(chunks_or_ds, k, varargin)
    % generates nchoosek(n,k) cross-validation partitions, with optional grouping schemas
    %
    % partitions=cosmo_nchoosek_partitioner(chunks, k, group_values1, test_group_by1,...)
    %
    % Input
    %  - chunks          Px1 chunk indices for P samples. It can also be a
    %                    dataset struct with field .sa.chunks
    %  - k               When an integer, k chunks are in each test partition.
    %                    When between 0 and 1, this is interpreted as
    %                    round(k*nchunks) where nchunks is the number of unique
    %                    chunks in chunks.
    %                    A special case, mostly aimed at split-half
    %                    correlations, is k=='half'; this sets k to .5,
    %                    and if the number of unique chunks is even, it
    %                    treats train_indices and test_indices as
    %                    symmetrical, meaning it returns only half the
    %                    number of partitions (avoiding mirrored
    %                    duplicates). If the number of unique chunks is
    %                    odd, then the test and train sets contain
    %                    (nchunks+1)/2 and (nchunks-1)/2 chunks,
    %                    respectively.
    %  - group_values*   } Intended for cross-participant or cross-condition
    %  - test_group_by*  } generalizability analyses.
    %                      Each pair defines a subsequent level of group
    %                      partitioning. Each group_values can be a cell
    %                      with the labels for a test group, or, if the
    %                      first input is a dataset struct, a string with
    %                      the name of a sample attribute whose values are
    %                      used as labels.
    %                      Each test_group_by indicates which values in
    %                      group_values are used as test values in the
    %                      cross-validation scheme (all other values are
    %                      used as training values). If empty, it is set to
    %                      the unique values in group_values.
    %
    % Output:
    %  - partitions      A struct with fields:
    %     .train_indices } Each of these is a 1xQ cell for Q partitions,
    %     .test_indices  } where Q=nchoosek(nchunks,k). Together they
    %                      contain all possible combinations with k test
    %                      chunks and (nchunks-k) training chunks (except
    %                      when k=='half', see above).
    %                    If group_values* and test_group_by* are specified,
    %                    then the number of output partitions is (at most)
    %                    multiplied by the product of the number of values
    %                    in test_group_by*; partitions for which the train
    %                    or test set would be empty are dropped.
    %
    %
    % Examples:
    %     % make a simple dataset with 4 chunks, 2 samples each
    %     % assume two targets (i.e. conditions, say piano versus guitar)
    %     ds=struct();
    %     ds.samples=randn(8,99); % 8 samples, 99 features
    %     ds.sa.targets=[1 1 1 1 2 2 2 2]';
    %     ds.sa.chunks=2+[1 2 3 4 4 3 2 1]';
    %     cosmo_check_dataset(ds); % sanity check
    %     %
    %     % take-one-chunk out partitioning
    %     p=cosmo_nchoosek_partitioner(ds,1);
    %     cosmo_disp(p);
    %     %|| .train_indices
    %     %||   { [ 2    [ 1    [ 1    [ 1
    %     %||       3      3      2      2
    %     %||       4      4      4      3
    %     %||       5      5      5      6
    %     %||       6      6      7      7
    %     %||       7 ]    8 ]    8 ]    8 ] }
    %     %|| .test_indices
    %     %||   { [ 1    [ 2    [ 3    [ 4
    %     %||       8 ]    7 ]    6 ]    5 ] }
    %     %
    %     % take-two chunks out partitioning
    %     p=cosmo_nchoosek_partitioner(ds,2);
    %     cosmo_disp(p);
    %     %|| .train_indices
    %     %||   { [ 3    [ 2    [ 2    [ 1    [ 1    [ 1
    %     %||       4      4      3      4      3      2
    %     %||       5      5      6      5      6      7
    %     %||       6 ]    7 ]    7 ]    8 ]    8 ]    8 ] }
    %     %|| .test_indices
    %     %||   { [ 1    [ 1    [ 1    [ 2    [ 2    [ 3
    %     %||       2      3      4      3      4      4
    %     %||       7      6      5      6      5      5
    %     %||       8 ]    8 ]    8 ]    7 ]    7 ]    6 ] }
    %     %
    %     % take-half-of-the-chunks out partitioning
    %     % (this effectively gives the same partitions as above)
    %     p_alt=cosmo_nchoosek_partitioner(ds,.5);
    %     isequal(p, p_alt)
    %     %|| true
    %     %
    %     % do half split (for correlation measure); this leaves out
    %     % mirror partitions of train and test indices
    %     p=cosmo_nchoosek_partitioner(ds,'half');
    %     cosmo_disp(p);
    %     %|| .train_indices
    %     %||   { [ 1    [ 1    [ 1
    %     %||       2      3      4
    %     %||       7      6      5
    %     %||       8 ]    8 ]    8 ] }
    %     %|| .test_indices
    %     %||   { [ 3    [ 2    [ 2
    %     %||       4      4      3
    %     %||       5      5      6
    %     %||       6 ]    7 ]    7 ] }
    %     %
    %     % test on samples with chunk=3 only using take-one-fold out
    %     p=cosmo_nchoosek_partitioner(ds,1,'chunks',3);
    %     cosmo_disp(p);
    %     %|| .train_indices
    %     %||   { [ 2
    %     %||       3
    %     %||       4
    %     %||       5
    %     %||       6
    %     %||       7 ] }
    %     %|| .test_indices
    %     %||   { [ 1
    %     %||       8 ] }
    %     % test on samples with chunk=[3 4] only using take-one-fold out;
    %     % only samples with chunks=5 or 6 are used for training
    %     p=cosmo_nchoosek_partitioner(ds,1,'chunks',{[3 4]});
    %     cosmo_disp(p);
    %     %|| .train_indices
    %     %||   { [ 3    [ 3
    %     %||       4      4
    %     %||       5      5
    %     %||       6 ]    6 ] }
    %     %|| .test_indices
    %     %||   { [ 1    [ 2
    %     %||       8 ]    7 ] }
    %     % test separately on samples with chunk=3 and samples with chunk=4;
    %     % in one fold, samples with chunks 4, 5 and 6 are used for training;
    %     % in the other fold, samples with chunks 3, 5 and 6 are used
    %     p=cosmo_nchoosek_partitioner(ds,1,'chunks',{[3],[4]});
    %     cosmo_disp(p);
    %     %|| .train_indices
    %     %||   { [ 2    [ 1
    %     %||       3      3
    %     %||       4      4
    %     %||       5      5
    %     %||       6      6
    %     %||       7 ]    8 ] }
    %     %|| .test_indices
    %     %||   { [ 1    [ 2
    %     %||       8 ]    7 ] }
    %     %||
    %
    %
    %     % make a slightly more complicated dataset: with three chunks,
    %     % suppose there are two modalities (e.g. (1) visual and (2)
    %     % auditory stimulation) which are stored in an
    %     % additional field 'modality'
    %     ds=struct();
    %     ds.samples=randn(12,99);
    %     ds.sa.chunks  =[1 1 1 1 2 2 2 2 3 3 3 3]';
    %     ds.sa.targets =[1 2 1 2 1 2 1 2 1 2 1 2]';
    %     ds.sa.modality=[1 1 2 2 1 1 2 2 1 1 2 2]';
    %     cosmo_check_dataset(ds);
    %     %
    %     % take-one-chunk out, test on samples with modality=1 (and train
    %     % on samples with other modalities, i.e. modality=2)
    %     p=cosmo_nchoosek_partitioner(ds,1,'modality',1);
    %     cosmo_disp(p);
    %     %|| .train_indices
    %     %||   { [  7    [  3    [ 3
    %     %||        8       4      4
    %     %||       11      11      7
    %     %||       12 ]    12 ]    8 ] }
    %     %|| .test_indices
    %     %||   { [ 1    [ 5    [  9
    %     %||       2 ]    6 ]    10 ] }
    %     %
    %     % take-one-chunk out, test on samples with modality=2
    %     p=cosmo_nchoosek_partitioner(ds,1,'modality',2);
    %     cosmo_disp(p);
    %     %|| .train_indices
    %     %||   { [  5    [  1    [ 1
    %     %||        6       2      2
    %     %||        9       9      5
    %     %||       10 ]    10 ]    6 ] }
    %     %|| .test_indices
    %     %||   { [ 3    [ 7    [ 11
    %     %||       4 ]    8 ]    12 ] }
    %     % take-one-chunk out, test on samples with modality=1 (and train on
    %     % modality=2) and vice versa
    %     p=cosmo_nchoosek_partitioner(ds,1,'modality',[]);
    %     cosmo_disp(p);
    %     %|| .train_indices
    %     %||   { [  7    [  3    [ 3    [  5    [  1    [ 1
    %     %||        8       4      4       6       2      2
    %     %||       11      11      7       9       9      5
    %     %||       12 ]    12 ]    8 ]    10 ]    10 ]    6 ] }
    %     %|| .test_indices
    %     %||   { [ 1    [ 5    [  9    [ 3    [ 7    [ 11
    %     %||       2 ]    6 ]    10 ]    4 ]    8 ]    12 ] }
    %
    %     % between-subject classification: 3 chunks, 2 modalities, 5 subjects
    %     ds=struct();
    %     ds.samples=randn(60,99);
    %     ds.sa.targets=repmat([1 2],1,30)';
    %     ds.sa.chunks=repmat([1 1 1 1 2 2 2 2 3 3 3 3],1,5)';
    %     ds.sa.modality=repmat([1 1 2 2],1,15)';
    %     ds.sa.subject=kron(1:5,ones(1,12))';
    %     cosmo_check_dataset(ds);
    %     %
    %     % test on subject 1, train on other subjects using take-one-chunk out
    %     p=cosmo_nchoosek_partitioner(ds,1,'subject',1);
    %     cosmo_disp(p);
    %     %|| .train_indices
    %     %||   { [ 17         [ 13         [ 13
    %     %||       18           14           14
    %     %||       19           15           15
    %     %||        :            :            :
    %     %||       58           58           54
    %     %||       59           59           55
    %     %||       60 ]@32x1    60 ]@32x1    56 ]@32x1 }
    %     %|| .test_indices
    %     %||   { [ 1    [ 5    [  9
    %     %||       2      6      10
    %     %||       3      7      11
    %     %||       4 ]    8 ]    12 ] }
    %     %
    %     % test on each subject after training on each other subject
    %     % in each fold, the test data is from one subject and one chunk,
    %     % and the train data from all other subjects and all other chunks.
    %     % since there are 5 subjects and 3 chunks, there are 15 folds.
    %     p=cosmo_nchoosek_partitioner(ds,1,'subject',[]);
    %     cosmo_disp(p);
    %     %|| .train_indices
    %     %||   { [ 17         [ 13         [ 13        ... [  5         [  1         [  1
    %     %||       18           14           14               6            2            2
    %     %||       19           15           15               7            3            3
    %     %||        :            :            :               :            :            :
    %     %||       58           58           54              46           46           42
    %     %||       59           59           55              47           47           43
    %     %||       60 ]@32x1    60 ]@32x1    56 ]@32x1       48 ]@32x1    48 ]@32x1    44 ]@32x1   }@1x15
    %     %|| .test_indices
    %     %||   { [ 1    [ 5    [  9   ... [ 49    [ 53    [ 57
    %     %||       2      6      10         50      54      58
    %     %||       3      7      11         51      55      59
    %     %||       4 ]    8 ]    12 ]       52 ]    56 ]    60 ]   }@1x15
    %     %
    %     % as above, but test on modality=2 (and train on other values for
    %     % modality, i.e. modality=1)
    %     p=cosmo_nchoosek_partitioner(ds,1,'subject',[],'modality',2);
    %     cosmo_disp(p);
    %     %|| .train_indices
    %     %||   { [ 17         [ 13         [ 13        ... [  5         [  1         [  1
    %     %||       18           14           14               6            2            2
    %     %||       21           21           17               9            9            5
    %     %||        :            :            :               :            :            :
    %     %||       54           50           50              42           38           38
    %     %||       57           57           53              45           45           41
    %     %||       58 ]@16x1    58 ]@16x1    54 ]@16x1       46 ]@16x1    46 ]@16x1    42 ]@16x1   }@1x15
    %     %|| .test_indices
    %     %||   { [ 3    [ 7    [ 11   ... [ 51    [ 55    [ 59
    %     %||       4 ]    8 ]    12 ]       52 ]    56 ]    60 ]   }@1x15
    %     %
    %     % as above, but test on each modality after training on the other
    %     % modality. There are 30 folds (5 subjects, 3 chunks, 2 modalities).
    %     p=cosmo_nchoosek_partitioner(ds,1,'subject',[],'modality',[]);
    %     cosmo_disp(p);
    %     %|| .train_indices
    %     %||   { [ 19         [ 15         [ 15        ... [  5         [  1         [  1
    %     %||       20           16           16               6            2            2
    %     %||       23           23           19               9            9            5
    %     %||        :            :            :               :            :            :
    %     %||       56           52           52              42           38           38
    %     %||       59           59           55              45           45           41
    %     %||       60 ]@16x1    60 ]@16x1    56 ]@16x1       46 ]@16x1    46 ]@16x1    42 ]@16x1   }@1x30
    %     %|| .test_indices
    %     %||   { [ 1    [ 5    [  9   ... [ 51    [ 55    [ 59
    %     %||       2 ]    6 ]    10 ]       52 ]    56 ]    60 ]   }@1x30
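    %     %
    %     % as noted in the Notes section below, calling this function with
    %     % two input arguments and k=1 is expected to give the same result
    %     % as cosmo_nfold_partitioner; a minimal sketch of such a check
    %     % (using the dataset 'ds' defined above):
    %     p=cosmo_nchoosek_partitioner(ds,1);
    %     isequal(p, cosmo_nfold_partitioner(ds))
    %     %|| true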
    %
    % Notes:
    %    - When called with two input arguments and k=1, this function
    %      behaves equivalently to cosmo_nfold_partitioner (see the final
    %      example above). Thus, for the simplest case (n-fold
    %      partitioning), either this function with k=1 or
    %      cosmo_nfold_partitioner can be used.
    %    - This function does not consider any .sa.targets (trial condition)
    %      or .samples information.
    %    - As shown in the examples above, this function can be used for
    %      cross-modal and/or cross-participant cross-validation.
    %    - For cross-validation it is recommended to balance partitions using
    %      cosmo_balance_partitions (a sketch is shown at the end of these
    %      notes).
    %    - This function can be used for cross-decoding analyses. Doing so
    %      may require a re-assignment of .sa.targets, and adding another
    %      sample attribute to specify which samples are used for training
    %      and testing.
    %     For example, consider the following dataset with six unique
    %     conditions as specified in the sample attribute field .sa:
    %
    %       .targets    .chunks     .labels
    %       1           1           'vis_dog'
    %       2           1           'vis_cat'
    %       3           1           'vis_frog'
    %       4           1           'aud_dog'
    %       5           1           'aud_cat'
    %       6           1           'aud_frog'
    %       1           2           'vis_dog'
    %       2           2           'vis_cat'
    %       :           :               :
    %       6           8           'aud_frog'
    %
    %    This dataset has 8 chunks, each with 6 conditions: three for visual
    %    stimuli of dogs, cats, and frogs, and three for auditory stimuli for
    %    the same animals. The field .labels is not required, but used for a
    %    human-readable description of the condition of each sample.
    %
    %    Suppose that one wants to do cross-decoding to see
    %    if discrimination of animals generalizes between the visual and
    %    auditory modalities.
    %    To do so, the user has to:
    %       * change the .targets field, to indicate the animal species,
    %       * add another field (here 'modality') indicating which samples are
    %         used for the cross-decoding
    %
    %    In this example, the sample attribute field .sa can be set as follows:
    %
    %       .targets    .chunks     .labels     .modality
    %       1           1           'vis_dog'   1
    %       2           1           'vis_cat'   1
    %       3           1           'vis_frog'  1
    %       1           1           'aud_dog'   2
    %       2           1           'aud_cat'   2
    %       3           1           'aud_frog'  2
    %       1           2           'vis_dog'   1
    %       2           2           'vis_cat'   1
    %       :           :               :
    %       6           8           'aud_frog'  2
    %
    %    so that:
    %     * .modality corresponds to the visual (=1) and auditory (=2)
    %       modality.
    %     * .targets corresponds to the dog (=1), cat (=2), and frog (=3)
    %       species.
    %
    %    With this re-assignment of .targets, testing on the auditory modality
    %    and training on the visual modality with take-one-chunk out
    %    cross-validation can be done using:
    %
    %       test_modality=2; % train on all other modalities (here: only 1)
    %       partitions=cosmo_nchoosek_partitioner(ds,1,...
    %                                   'modality',test_modality);
    %
    %    If test_modality is set to empty ([]), then both modalities are used
    %    for training and for testing (in separate folds).
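    %
    %    As recommended in the notes above, for classification analyses the
    %    resulting partitions can be balanced with cosmo_balance_partitions;
    %    a minimal sketch of such a workflow (assuming the dataset struct ds
    %    described above, and optionally checking the result with
    %    cosmo_check_partitions):
    %
    %       partitions=cosmo_nchoosek_partitioner(ds,1,...
    %                                   'modality',test_modality);
    %       partitions=cosmo_balance_partitions(partitions,ds);
    %       cosmo_check_partitions(partitions,ds);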
    %
    % See also: cosmo_nfold_partitioner, cosmo_balance_partitions
    %
    % #   For CoSMoMVPA's copyright information and license terms,   #
    % #   see the COPYING file distributed with CoSMoMVPA.           #

    % if the first input is a dataset, get the chunks from it
    is_ds = isstruct(chunks_or_ds);
    if is_ds
        if isfield(chunks_or_ds, 'sa') && isfield(chunks_or_ds.sa, 'chunks')
            % it is a dataset
            ds = chunks_or_ds;
            chunks = ds.sa.chunks;
        else
            error(['illegal struct input: expected a dataset struct '...
                   'with field .sa.chunks']);
        end
    else
        chunks = chunks_or_ds;
    end

    % use helper function defined below
    partitions = nchoosek_partitioner(chunks, k);

    if nargin < 3
        return
    elseif mod(nargin, 2) ~= 0
        error('Need even number of arguments');
    else
        nsamples = numel(chunks);
        % loop over (group_values, test_group_by) pairs in varargin
        for arg_pos = 1:2:(nargin - 2)
            group_values = varargin{arg_pos};
            test_group_by = varargin{arg_pos + 1};

            % get sample attribute, if input was dataset
            if ischar(group_values)
                if is_ds
                    group_values = ds.sa.(group_values);
                else
                    error('String for group value requires dataset input');
                end
            end

            % check number of values matches
            ngroup_by = numel(group_values);
            if ngroup_by ~= nsamples
                error('group_values has %d values, expected %d', ...
                      ngroup_by, nsamples);
            end

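            % an empty test_group_by means: use each unique value in
            % group_values as a test value in turn (see documentation above)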
            if isempty(test_group_by)
                test_group_by = num2cell(unique(group_values));
            elseif isnumeric(test_group_by)
                test_group_by = {test_group_by};
            end

            % update partitions using helper function below
            partitions = group_by(partitions, group_values, test_group_by);
        end
    end

function partitions = group_by(partitions, group_values, test_group_by)
    % helper function to group partitions
    % the output has (at most) N times as many partitions as the input,
    % where N=numel(test_group_by); partitions with an empty train or
    % test set are dropped

    npartitions = numel(partitions.test_indices);

    assert(iscell(test_group_by));
    % see which values to split on
    ntest_group_by = numel(test_group_by);

    % allocate space for output
    train_indices = cell(1, ntest_group_by);
    test_indices = cell(1, ntest_group_by);

    % run for each value (or set of values) in test_group_by
    for m = 1:ntest_group_by
        test_by = test_group_by{m};

        % allocate space for this iteration
        train_indices_cell = cell(1, npartitions);
        test_indices_cell = cell(1, npartitions);

        % some filtered partitions may be empty, so keep track of the
        % last position where a value was stored
        pos = 0;
        for j = 1:npartitions
            % get testing chunk indices
            p_test = partitions.test_indices{j};

            % see which ones match the group_by_value
            msk_test = cosmo_match(group_values(p_test), test_by);

            % keep just those indices
            p_test_masked = p_test(msk_test);

            % the same for training, but different from test_by
            p_train = partitions.train_indices{j};
            msk_train = ~cosmo_match(group_values(p_train), test_by);
            p_train_masked = p_train(msk_train);

            if ~isempty(p_test_masked) && ~isempty(p_train_masked)
                % both training and test set are non-empty, so keep result
                pos = pos + 1;
                test_indices_cell{pos} = p_test_masked;
                train_indices_cell{pos} = p_train_masked;
            end
        end

        % store the test-indices for m-th group_by_value
        test_indices{m} = test_indices_cell(1:pos);
        train_indices{m} = train_indices_cell(1:pos);
    end

    partitions = struct();

    % concatenate results for training and test indices
    partitions.train_indices = cat(2, train_indices{:});
    partitions.test_indices = cat(2, test_indices{:});

function partitions = nchoosek_partitioner(chunks, k)
    % straightforward partitioner

    [unq, unused, chunk_indices] = unique(chunks);
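    % chunk_indices contains, for each sample, the position (1..nchunks)
    % of its chunk value in the sorted unique values unq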
    nchunks = numel(unq);

    if nchunks < 2
        error(['at least two unique values for .sa.chunks are required, '...
               'found %d.'], nchunks);
    end

    % deal with special 'half' case
    is_symmetric = false;
    if ischar(k)
        if strcmp(k, 'half')
            k = .5;
            is_symmetric = true;
        else
            error('illegal string k');
        end
    end
    if isnumeric(k)
        if ~isscalar(k)
            error('k must be a scalar');
        end

        if k ~= round(k)
            k = round(nchunks * k);
        end

        if ~any(k == 1:(nchunks - 1))
            error(['illegal k=%d: the number of test chunks must be '...
                   'between 1 and %d'], k, nchunks - 1);
        end
    else
        error('illegal parameter for k');
    end

    % little optimization: if the chunk values are exactly 1 and 2,
    % the two-chunk split can be computed directly
    if all(cosmo_match(chunks, [1 2]))
        chunk_msk1 = chunks == 1;
        chunk1_idxs = find(chunk_msk1);
        chunk2_idxs = find(~chunk_msk1);

        partitions = struct();
        if is_symmetric
            partitions.train_indices = {chunk1_idxs};
            partitions.test_indices = {chunk2_idxs};
        else
            partitions.train_indices = {chunk1_idxs, chunk2_idxs};
            partitions.test_indices = {chunk2_idxs, chunk1_idxs};
        end
        return
    end

    npartitions = nchoosek(nchunks, k);
    combis = nchoosek(1:nchunks, k); % make all combinations

    if is_symmetric && mod(nchunks, 2) == 0
        % when nchunks is even, return only half of the combinations:
        % the current implementation of nchoosek is such that
        % combis(j,:) and combis(npartitions+1-j,:) are complementary
        % (i.e. together they make up 1:nchunks). In principle this could
        % change in the future, leading to wrong results (if Mathworks, in
        % its infinite wisdom, decides to change the implementation of
        % nchoosek), so to be sure we check that the output matches what is
        % expected. The rationale is that this reduces the computation time
        % of subsequent analyses by half, in particular for the commonly
        % used split-half correlations.
        nhalf = npartitions / 2;

        check_combis = [combis(1:nhalf, :) combis(npartitions:-1:(nhalf + 1), :)];

        % each row, when sorted, should be 1:nchunks
        matches = bsxfun(@eq, sort(check_combis, 2), 1:nchunks);
        assert(all(matches(:)), 'nchoosek behaves weirdly');

        % we're good - just take the second half and update npartitions
        combis = combis(npartitions:-1:(nhalf + 1), :);
        npartitions = nhalf;
    end

    % allocate space for output
    train_indices = cell(1, npartitions);
    test_indices = cell(1, npartitions);

    % make all partitions
    for j = 1:npartitions
        combi = combis(j, :);
        % logical mask of samples whose chunk is in the test combination
        test_msk = cosmo_match(chunk_indices, combi);
        test_indices{j} = find(test_msk);
        train_indices{j} = find(~test_msk);
    end

    partitions = struct();
    partitions.train_indices = train_indices;
    partitions.test_indices = test_indices;