function ds_sa = cosmo_crossvalidation_measure(ds, varargin)
% performs cross-validation using a classifier
%
% ds_sa = cosmo_crossvalidation_measure(dataset, args)
%
% Inputs:
%   dataset             struct with fields .samples (PxQ for P samples and
%                       Q features) and .sa.targets (Px1 labels of samples)
%   args                struct containing classifier, partitions, and
%                       possibly other fields that are given to the
%                       classifier.
%   args.classifier     function handle to classifier, e.g.
%                       @cosmo_classify_lda
%   args.partitions     Partition scheme, for example the output from
%                       cosmo_nfold_partitioner
%   args.output         One of:
%                       - 'accuracy':           overall classification
%                                               accuracy (default)
%                       - 'balanced_accuracy'   classification accuracy is
%                                               computed for each class,
%                                               then averaged
%                       - 'fold_accuracy'       classification accuracy is
%                                               computed for each fold
%                       - 'winner_predictions'  class that was predicted
%                                               for each sample in the test
%                                               set; in the case of
%                                               multiple predictions, ties
%                                               are decided in
%                                               pseudo-random fashion by
%                                               cosmo_winner_indices. The
%                                               output .samples field
%                                               contains predicted labels.
%                       - 'fold_predictions'    prediction for each sample
%                                               in each fold. The
%                                               output .samples field
%                                               contains predicted labels.
%                                               With this option, the
%                                               output has a field
%                                               .sa.folds
%   args.check_partitions  optional (default: true). If set to false then
%                          partitions are not checked for being set
%                          properly.
%   args.normalization  optional, one of 'zscore','demean','scale_unit'
%                       to normalize the data prior to classification using
%                       zscoring, demeaning or scaling to [-1,1] along the
%                       first dimension of ds. Normalization parameters are
%                       estimated using the training data and applied to
%                       the testing data (for each feature separately).
%   args.pca_explained_count   optional, transform the data with PCA prior
%                              to classification, and retain this number
%                              of components
%   args.pca_explained_ratio   optional, transform the data with PCA prior
%                              to classification, and retain the components
%                              that explain this fraction of the variance
%                              (value between 0 and 1); see the sketch near
%                              the end of the Examples section below
%   args.average_train_X  average the samples in the train set using
%                       cosmo_average_samples. For X, use any parameter
%                       supported by cosmo_average_samples, i.e. either
%                       'count' or 'ratio', and optionally, 'resamplings'
%                       or 'repeats'. See the sketch near the end of the
%                       Examples section below.
%
% Output:
%    ds_sa        Struct with fields:
%      .samples   Scalar with classification accuracy, or vector with
%                 predictions for the input samples.
%      .sa        Struct with field:
%                 - if args.output=='accuracy':
%                       .labels  =={'accuracy'}
%                 - if args.output=='winner_predictions':
%                       .targets   Px1 true labels of each sample; the
%                                  predicted labels are in .samples
%
% Examples:
%     ds=cosmo_synthetic_dataset('ntargets',3,'nchunks',4);
%     %
%     % use take-1-chunk out for testing crossvalidation
%     opt=struct();
%     opt.partitions=cosmo_nfold_partitioner(ds);
%     opt.classifier=@cosmo_classify_naive_bayes;
%     % run crossvalidation and return accuracy (the default)
%     acc_ds=cosmo_crossvalidation_measure(ds,opt);
%     cosmo_disp(acc_ds);
%     %|| .samples
%     %||   [ 0.917 ]
%     %|| .sa
%     %||   .labels
%     %||     { 'accuracy' }
%
%     % let the measure return predictions instead of accuracy,
%     % and use take-1-chunk out for testing crossvalidation;
%     % use LDA classifier and let targets be in range 7..9
%     ds=cosmo_synthetic_dataset('ntargets',3,'nchunks',4,'target1',7);
%     opt=struct();
%     opt.partitions=cosmo_nchoosek_partitioner(ds,1);
%     opt.output='winner_predictions';
%     opt.classifier=@cosmo_classify_lda;
%     pred_ds=cosmo_crossvalidation_measure(ds,opt);
%     %
%     % show results. The .samples field contains the predicted label for
%     % each sample, and .sa.targets the corresponding true labels
%     cosmo_disp(pred_ds);
%     %|| .samples
%     %||   [ 9
%     %||     8
%     %||     9
%     %||     :
%     %||     7
%     %||     9
%     %||     7 ]@12x1
%     %|| .sa
%     %||   .targets
%     %||     [ 7
%     %||       8
%     %||       9
%     %||       :
%     %||       7
%     %||       8
%     %||       9 ]@12x1
%     %||
%     %
%     % return accuracy, but use z-scoring on each training set
%     % and apply the estimated mean and std to the test set.
%     % Use take-2-chunks out for crossvalidation
%     ds=cosmo_synthetic_dataset('ntargets',3,'nchunks',4);
%     opt=struct();
%     opt.output='accuracy';
%     opt.normalization='zscore';
%     opt.classifier=@cosmo_classify_lda;
%     opt.partitions=cosmo_nchoosek_partitioner(ds,2);
%     z_acc_ds=cosmo_crossvalidation_measure(ds,opt);
%     cosmo_disp(z_acc_ds);
%     %|| .samples
%     %||   [ 0.75 ]
%     %|| .sa
%     %||   .labels
%     %||     { 'accuracy' }
%
%     % illustrate accuracy for partial test set
%     ds=cosmo_synthetic_dataset('ntargets',2,'nchunks',5);
%     %
%     % use take-1-chunk out for testing crossvalidation, but only test on
%     % chunks 2 and 4
%     opt=struct();
%     opt.partitions=cosmo_nchoosek_partitioner(ds,1,'chunks',[2 4]);
%     opt.classifier=@cosmo_classify_naive_bayes;
%     % run crossvalidation and return accuracy (the default)
%     acc_ds=cosmo_crossvalidation_measure(ds,opt);
%     % show accuracy
%     cosmo_disp(acc_ds.samples)
%     %|| 0.75
%     % show predictions
%     opt.output='winner_predictions';
%     pred_ds=cosmo_crossvalidation_measure(ds,opt);
%     cosmo_disp([pred_ds.samples pred_ds.sa.targets],'threshold',inf);
%     %|| [ NaN         1
%     %||   NaN         2
%     %||     1         1
%     %||     2         2
%     %||   NaN         1
%     %||   NaN         2
%     %||     1         1
%     %||     1         2
%     %||   NaN         1
%     %||   NaN         2 ]
%
%     % return predictions for each fold, using output='fold_predictions'.
%     % The output has now multiple predictions for each original input
%     % sample.
%     ds=cosmo_synthetic_dataset('ntargets',3,'nchunks',4,'target1',7);
%     opt=struct();
%     % use take-two-chunks-for-testing crossvalidation, resulting in
%     % nchoosek(4,2)=6 folds
%     opt.partitions=cosmo_nchoosek_partitioner(ds,2);
%     cosmo_disp(opt.partitions);
%     %|| .train_indices
%     %||   { [  7    [  4    [ 4    [  1    [ 1    [ 1
%     %||        8       5      5       2      2      2
%     %||        9       6      6       3      3      3
%     %||       10      10      7      10      7      4
%     %||       11      11      8      11      8      5
%     %||       12 ]    12 ]    9 ]    12 ]    9 ]    6 ] }
%     %|| .test_indices
%     %||   { [ 1    [ 1    [  1    [ 4    [  4    [  7
%     %||       2      2       2      5       5       8
%     %||       3      3       3      6       6       9
%     %||       4      7      10      7      10      10
%     %||       5      8      11      8      11      11
%     %||       6 ]    9 ]    12 ]    9 ]    12 ]    12 ] }
%     %
%     % set output to return predictions for each fold
%     opt.output='fold_predictions';
%     opt.classifier=@cosmo_classify_lda;
%     pred_ds=cosmo_crossvalidation_measure(ds,opt);
%     to_show={pred_ds.samples,pred_ds.sa.targets,pred_ds.sa.folds};
%     cosmo_disp(to_show,'edgeitems',8)
%     %|| { [ 9         [ 7         [ 1
%     %||     8           8           1
%     %||     9           9           1
%     %||     7           7           1
%     %||     8           8           1
%     %||     9           9           1
%     %||     9           7           2
%     %||     8           8           2
%     %||     :           :           :
%     %||     8           8           5
%     %||     7           9           5
%     %||     7           7           6
%     %||     8           8           6
%     %||     9           9           6
%     %||     7           7           6
%     %||     9           8           6
%     %||     7 ]@36x1    9 ]@36x1    6 ]@36x1 }
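%
%     % The block below is a minimal sketch (not a doctest; no output is
%     % shown) of the PCA and train-set averaging options documented in
%     % the Inputs section above:
%     %
%     %   % retain the PCA components that together explain 90% of the
%     %   % variance, estimated on each training set
%     %   ds=cosmo_synthetic_dataset('ntargets',3,'nchunks',4);
%     %   opt=struct();
%     %   opt.partitions=cosmo_nfold_partitioner(ds);
%     %   opt.classifier=@cosmo_classify_lda;
%     %   opt.pca_explained_ratio=.9;
%     %   pca_acc_ds=cosmo_crossvalidation_measure(ds,opt);
%     %
%     %   % average pairs of training samples prior to classification;
%     %   % this assumes a (hypothetical) dataset 'ds_multi' with multiple
%     %   % samples per target-chunk combination
%     %   opt=rmfield(opt,'pca_explained_ratio');
%     %   opt.partitions=cosmo_nfold_partitioner(ds_multi);
%     %   opt.average_train_count=2;
%     %   avg_acc_ds=cosmo_crossvalidation_measure(ds_multi,opt);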
%
% Notes:
%   - using this function, crossvalidation can be run using a searchlight
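%     A minimal sketch of such a searchlight analysis (assuming a
%     volumetric dataset 'ds' with proper .sa.targets and .sa.chunks,
%     and a spherical neighborhood):
%
%         nbrhood=cosmo_spherical_neighborhood(ds,'radius',3);
%         args=struct();
%         args.classifier=@cosmo_classify_lda;
%         args.partitions=cosmo_nfold_partitioner(ds);
%         result=cosmo_searchlight(ds,nbrhood,...
%                                  @cosmo_crossvalidation_measure,args);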
%
% See also: cosmo_searchlight, cosmo_average_samples
%
% #   For CoSMoMVPA's copyright information and license terms,   #
% #   see the COPYING file distributed with CoSMoMVPA.           #

    % deal with input arguments
    params=cosmo_structjoin('output','accuracy',... % default output
                            varargin); % use input arguments

    % Run cross validation to get the accuracy (see the help of
    % cosmo_crossvalidate)
    classifier=params.classifier;
    partitions=params.partitions;

    params=rmfield(params,'classifier');
    params=rmfield(params,'partitions');
    [pred,accuracy]=cosmo_crossvalidate(ds,classifier,...
                                            partitions,params);
    ds_sa=struct();
    output=params.output;

    switch output
        case 'accuracy'
            ds_sa.samples=accuracy;
            ds_sa.sa.labels={'accuracy'};

        case 'balanced_accuracy'
            ds_sa.samples=compute_balanced_accuracy(pred,ds.sa.targets);
            ds_sa.sa.labels={'balanced_accuracy'};


        case {'predictions','raw','winner_predictions'}
            if cosmo_match({output},{'predictions','raw'})
                cosmo_warning('CoSMoMVPA:deprecated',...
                        sprintf(...
                        ['Output option ''%s'' is deprecated and will '...
                        'be removed from a future release. Please use '...
                        'output=''winner_predictions'' instead, or use '...
                        'output=''fold_predictions'' to get '...
                        'predictions for each fold'],...
                            output));
            end

            ds_sa.samples=compute_winner_predictions(pred);
            ds_sa.sa=rmfield(ds.sa,'chunks');

        case 'fold_accuracy'
            [ds_sa.samples,ds_sa.sa.folds]=compute_fold_accuracy(pred,...
                                                        ds.sa.targets);

        case 'fold_predictions'
            ds_sa=struct();
            [ds_sa.samples,...
                    ds_sa.sa.folds,...
                    ds_sa.sa.targets]=compute_fold_predictions(pred,...
                                                        ds.sa.targets);


        case 'accuracy_by_chunk'
            error(['Output ''%s'' is no longer supported; consider '...
                    'using ''fold_predictions'' instead'], output);

        otherwise
            error('Illegal output parameter %s', params.output);
    end

function [accs,folds]=compute_fold_accuracy(pred,targets)
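    % return, for each fold (column of pred), the proportion of correct
    % predictions among the test samples with a (non-NaN) prediction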
    is_correct=bsxfun(@eq,pred,targets);
    has_pred=~isnan(pred);

    accs=(sum(is_correct,1) ./ sum(has_pred,1))';
    folds=(1:numel(accs))';


function [fold_pred,folds,fold_targets]=compute_fold_predictions(pred,...
                                                                targets)
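    % stack, over folds, the predictions for the test samples (those with
    % non-NaN predictions), together with the fold number and the true
    % target of each predicted sample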
    nfolds=size(pred,2);

    pred_cell=cell(nfolds,1);
    fold_cell=cell(nfolds,1);
    target_cell=cell(nfolds,1);

    has_pred=~isnan(pred);
    fold_pred_count=sum(has_pred,1);

    for i_fold=1:nfolds
        msk=has_pred(:,i_fold);
        pred_cell{i_fold}=pred(msk,i_fold);
        fold_cell{i_fold}=zeros(fold_pred_count(i_fold),1)+i_fold;
        target_cell{i_fold}=targets(msk);
    end

    fold_pred=cat(1,pred_cell{:});
    folds=cat(1,fold_cell{:});
    fold_targets=cat(1,target_cell{:});


function winner_pred=compute_winner_predictions(pred)
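    % for each sample, take the class predicted most often across folds
    % (ties are broken pseudo-randomly by cosmo_winner_indices), and map
    % the winning class index back to the original class label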
    [winner_pred,classes]=cosmo_winner_indices(pred);
    has_pred=~isnan(winner_pred);
    winner_pred(has_pred)=classes(winner_pred(has_pred));



function acc=compute_balanced_accuracy(pred,targets)
    [class_idx,classes]=cosmo_index_unique(targets);
    nclasses=numel(classes);

    each_class_acc=zeros(nclasses,1);

    % some samples may be without predictions (and are set to NaN
    % by cosmo_crossvalidate). Consider only the samples with
    % predictions, and compute for each class the classification
    % accuracy
    has_prediction = ~isnan(pred);
    has_any_prediction=any(has_prediction,2);

    for c=1:nclasses
        idx=class_idx{c};
        keep_idx=idx(has_any_prediction(idx));

        n_in_class=numel(keep_idx);
        class_accs=NaN(n_in_class,1);
        for j=1:n_in_class
            row=keep_idx(j);
            has_row_prediction=has_prediction(row,:);
            if ~any(has_row_prediction)
                continue;
            end

            is_correct=pred(row,:)==targets(row);
            class_accs(j)=sum(is_correct) / sum(has_row_prediction);
        end

        non_nan=~isnan(class_accs);
        each_class_acc(c)=sum(class_accs(non_nan))/sum(non_nan);
    end

    acc=mean(each_class_acc,1);