cosmo crossvalidate

function [pred, accuracy] = cosmo_crossvalidate(ds, classifier, partitions, opt)
    % performs cross-validation using a classifier
    %
    % [pred, accuracy] = cosmo_crossvalidate(dataset, classifier, partitions, opt)
    %
    % Inputs
    %   ds                  struct with fields .samples (PxQ for P samples and
    %                       Q features) and .sa.targets (Px1 labels of samples)
    %   classifier          function handle to classifier, e.g.
    %                       @classify_naive_baysian
    %   partitions          For example the output from nfold_partition
    %   opt                 optional struct with options for classifier
    %     .normalization    optional, one of 'zscore','demean','scale_unit'
    %                       to normalize the data prior to classification using
    %                       zscoring, demeaning or scaling to [-1,1] along the
    %                       first dimension of ds. Normalization
    %                       parameters are estimated using the training data
    %                       and applied to the testing data.
    %     .pca_explained_count   optional, transform the data with PCA prior to
    %                            classification, and retain this number of
    %                            components
    %     .pca_explained_ratio   optional, transform the data with PCA prior to
    %                            classification, and retain the components that
    %                            explain this percentage of the variance
    %                            (value between 0-1)
    %    .check_partitions  optional (default: true). If set to false then
    %                       partitions are not checked for being set properly.
    %    .average_train_X   average the samples in the train set using
    %                       cosmo_average_samples. For X, use any parameter
    %                       supported by cosmo_average_samples, i.e. either
    %                       'count' or 'ratio', and optionally, 'resamplings'
    %                       or 'repeats'.
    %
    % Output
    %   pred                Qx1 array with predicted class labels.
    %                       elements with no predictions have the value NaN.
    %   accuracy            scalar classification accuracy
    %   test_chunks         Qx1 array with chunks of input dataset, if each
    %                       prediction was based using a single classification
    %                       step. Predictions with no or more than one
    %                       classification step are set to NaN
    %
    % Examples:
    %     % generate dataset with 3 targets and 4 chunks, first target is 3
    %     ds=cosmo_synthetic_dataset('ntargets',3,'nchunks',4,'target1',3);
    %     % use take-1-chunk for testing crossvalidation
    %     partitions=cosmo_nfold_partitioner(ds);
    %     classifier=@cosmo_classify_naive_bayes;
    %     % run crossvalidation
    %     [pred,accuracy]=cosmo_crossvalidate(ds, classifier, ...
    %                                                       partitions);
    %     % show targets, chunks, and predictions labels for each of the
    %     % four folds
    %     cosmo_disp({ds.sa.targets,ds.sa.chunks,pred},'threshold',inf)
    %     %|| { [ 3    [ 1    [   3   NaN   NaN   NaN
    %     %||     4      1        4   NaN   NaN   NaN
    %     %||     5      1        5   NaN   NaN   NaN
    %     %||     3      2      NaN     3   NaN   NaN
    %     %||     4      2      NaN     5   NaN   NaN
    %     %||     5      2      NaN     5   NaN   NaN
    %     %||     3      3      NaN   NaN     3   NaN
    %     %||     4      3      NaN   NaN     4   NaN
    %     %||     5      3      NaN   NaN     5   NaN
    %     %||     3      4      NaN   NaN   NaN     3
    %     %||     4      4      NaN   NaN   NaN     4
    %     %||     5 ]    4 ]    NaN   NaN   NaN     5 ] }
    %     cosmo_disp(accuracy)
    %     %||  0.917
    %     %
    %     % use take-2-chunks out for testing crossvalidation, LDA classifier
    %     partitions=cosmo_nchoosek_partitioner(ds,2);
    %     classifier=@cosmo_classify_lda;
    %     % run crossvalidation
    %     [pred,accuracy]=cosmo_crossvalidate(ds, classifier, ...
    %                                                           partitions);
    %     % show targets, chunks, and predictions labels for each of the
    %     % four folds
    %     cosmo_disp({ds.sa.targets,ds.sa.chunks,pred},'threshold',inf)
    %     %|| { [ 3    [ 1    [   5     5     3   NaN   NaN   NaN
    %     %||     4      1        4     4     4   NaN   NaN   NaN
    %     %||     5      1        5     5     4   NaN   NaN   NaN
    %     %||     3      2        3   NaN   NaN     3     3   NaN
    %     %||     4      2        4   NaN   NaN     4     4   NaN
    %     %||     5      2        5   NaN   NaN     4     5   NaN
    %     %||     3      3      NaN     5   NaN     3   NaN     3
    %     %||     4      3      NaN     4   NaN     4   NaN     4
    %     %||     5      3      NaN     5   NaN     5   NaN     5
    %     %||     3      4      NaN   NaN     3   NaN     3     3
    %     %||     4      4      NaN   NaN     4   NaN     4     5
    %     %||     5 ]    4 ]    NaN   NaN     5   NaN     3     3 ] }
    %     cosmo_disp(accuracy)
    %     %||   0.778
    %     %
    %     % as the example above, but (1) use z-scoring on each training set
    %     % and apply the estimated mean and std to the test set, and (2)
    %     % use odd-even partitioner
    %     opt=struct();
    %     opt.normalization='zscore';
    %     partitions=cosmo_oddeven_partitioner(ds);
    %     % run crossvalidation
    %     [pred,accuracy]=cosmo_crossvalidate(ds, classifier, ...
    %                                                        partitions, opt);
    %     % show targets, predicted labels, and accuracy
    %     cosmo_disp({ds.sa.targets,ds.sa.chunks,pred},'threshold',inf)
    %     %|| { [ 3    [ 1    [ NaN     5
    %     %||     4      1      NaN     4
    %     %||     5      1      NaN     5
    %     %||     3      2        3   NaN
    %     %||     4      2        4   NaN
    %     %||     5      2        5   NaN
    %     %||     3      3      NaN     5
    %     %||     4      3      NaN     4
    %     %||     5      3      NaN     5
    %     %||     3      4        3   NaN
    %     %||     4      4        4   NaN
    %     %||     5 ]    4 ]      3   NaN ] }
    %     cosmo_disp(accuracy)
    %     %||   0.75
    %
    % Notes:
    %   - to apply this to a dataset struct as a measure (for searchlights),
    %     consider using cosmo_crossvalidation_measure
    %   - to average samples in the training set prior to training, use the
    %     options provided by cosmo_average_samples prefixed by
    %     'average_train_'. For example, to take averages of 5 samples, and use
    %     each sample in the input approximately 4 times, use:
    %           opt.average_train_count=5;
    %           opt.average_train_resamplings=4;
    %
    % See also: cosmo_crossvalidation_measure, cosmo_average_samples
    %
    % #   For CoSMoMVPA's copyright information and license terms,   #
    % #   see the COPYING file distributed with CoSMoMVPA.           #

    if nargin < 4
        opt = struct();
    end
    if ~isfield(opt, 'normalization')
        opt.normalization = [];
    end
    if ~isfield(opt, 'check_partitions')
        opt.check_partitions = true;
    end

    if ~isempty(opt.normalization)
        normalization = opt.normalization;
        opt.autoscale = false; % disable for {matlab,lib}svm classifiers
    else
        normalization = [];
    end

    if all(isfield(opt, {'pca_explained_count', 'pca_explained_ratio'}))
        error(['pca_explained_count and pca_explained_ratio are ' ...
               'mutually exclusive']);
    elseif isfield(opt, 'pca_explained_count')
        arg_pca = 'pca_explained_count';
        arg_pca_value = opt.pca_explained_count;
    elseif isfield(opt, 'pca_explained_ratio')
        arg_pca = 'pca_explained_ratio';
        arg_pca_value = opt.pca_explained_ratio;
    else
        arg_pca = [];
    end

    if opt.check_partitions
        cosmo_check_partitions(partitions, ds);
    end

    % see if samples in training set are to be averaged
    [do_average_train, average_train_opt] = get_average_train_opt(opt);

    train_indices = partitions.train_indices;
    test_indices = partitions.test_indices;

    npartitions = numel(train_indices);

    nsamples = size(ds.samples, 1);

    % space for output (one column per partition)
    % the k-th column contains predictions for the k-th fold
    % (with values of NaNs if there was no prediction)
    pred = NaN(nsamples, npartitions);

    targets = ds.sa.targets;

    % process each fold
    for fold = 1:npartitions
        train_idxs = train_indices{fold};
        test_idxs = test_indices{fold};
        % for each partition get the training and test data, store in
        % train_data and test_data
        train_data = ds.samples(train_idxs, :);
        train_targets = targets(train_idxs);

        if do_average_train
            % make a minimal dataset, then use cosmo_average_samples to
            % compute averages
            n_train = numel(train_idxs);
            ds_train = struct();
            ds_train.samples = train_data;
            ds_train.sa.chunks = ones(n_train, 1);
            ds_train.sa.targets = train_targets;

            ds_train_avg = cosmo_average_samples(ds_train, average_train_opt);
            train_data = ds_train_avg.samples;
            train_targets = ds_train_avg.sa.targets;
        end

        test_data = ds.samples(test_idxs, :);

        % apply pca
        if ~isempty(arg_pca)
            [train_data, pca_params] = cosmo_map_pca(train_data, ...
                                                     arg_pca, arg_pca_value);
            test_data = cosmo_map_pca(test_data, 'pca_params', pca_params);
        end

        % apply normalization
        if ~isempty(normalization)
            [train_data, params] = cosmo_normalize(train_data, normalization);
            test_data = cosmo_normalize(test_data, params);
        end

        % then get predictions for the training samples using
        % the classifier, and store these in the k-th column of all_pred.
        p = classifier(train_data, train_targets, test_data, opt);

        pred(test_idxs, fold) = p;
    end

    % compute accuracies
    has_prediction_mask = ~isnan(pred);
    correct_mask = bsxfun(@eq, targets, pred) & has_prediction_mask;

    accuracy = sum(correct_mask) / sum(has_prediction_mask);

function [do_average_train, average_train_opt] = get_average_train_opt(opt)
    persistent cached_opt
    persistent cached_do_average_train
    persistent cached_average_train_opt

    if ~isequal(cached_opt, opt)
        cached_average_train_opt = struct();
        cached_do_average_train = false;
        prefix = 'average_train_';
        keys = fieldnames(opt);
        for k = 1:numel(keys)
            key = keys{k};
            sp = cosmo_strsplit(key, prefix);
            if isempty(sp{1})
                % starts with average_train, so take the rest of the key
                % and flag cached_do_average_train
                cached_average_train_opt.(sp{2}) = opt.(key);
                cached_do_average_train = true;
            end
        end

        cached_opt = opt;
    end

    do_average_train = cached_do_average_train;
    average_train_opt = cached_average_train_opt;