% cosmo winner indices skl

function [winners, classes] = cosmo_winner_indices(pred)
    % Given multiple predictions, get indices that were predicted most often.
    %
    % [winners,classes]=cosmo_winner_indices(pred)
    %
    % Input:
    %   pred              PxQ prediction values for Q features and P
    %                     predictions per feature. Values of NaN are ignored,
    %                     i.e. can never be a winner.
    %
    % Output:
    %   winners           Px1 indices of classes that occur most often.
    %                     winners(k)==w means that no value in
    %                     classes(pred(k,:)) occurs more often than classes(w).
    %   classes           The sorted list of unique predicted values, across
    %                     all non-ignored (non-NaN) values in pred.
    %
    % Examples:
    %     % a single prediction, with the third one missing
    %     pred=[4; 4; NaN; 5];
    %     [p, c]=cosmo_winner_indices(pred);
    %     p'
    %     %|| [1 1 NaN 2]
    %     c'
    %     %|| [4, 5]
    %
    %     % one prediction per fold (e.g. using cosmo_nfold_partitioner)
    %     pred=[4 NaN NaN; 6 NaN NaN; NaN 3 NaN; NaN NaN NaN; NaN NaN 3];
    %     [p, c]=cosmo_winner_indices(pred);
    %     p'
    %     %|| [2, 3, 1, NaN, 1]
    %     c'
    %     %|| [3 4 6]
    %
    %     % given up to three predictions each for eight samples, compute
    %     % which predictions occur most often. NaNs are ignored.
    %     pred=[4 4 4;4 5 6;6 5 4;5 6 4;4 5 6; NaN NaN NaN; 6 0 0;0 0 NaN];
    %     [p, c]=cosmo_winner_indices(pred);
    %     p'
    %     %|| [2, 3, 4, 2, 3, NaN, 1, 1]
    %     c'
    %     %|| [0, 4, 5, 6]
    %
    % Notes:
    % - The typical use case is combining results from multiple classification
    %   predictions, such as in binary support vector machines (SVMs) and
    %   cosmo_crossvalidate
    % - The current implementation selects a winner pseudo-randomly (but
    %   deterministically) and (presumably) unbiased in case of a tie between
    %   multiple winners. That is, using the present implementation, repeatedly
    %   calling this function with identical input yields identical output,
    %   but unbiased with respect to which class is the 'winner' sample-wise.
    % - Samples with no predictions are assigned a value of NaN.
    %
    % See also: cosmo_classify_matlabsvm, cosmo_crossvalidate
    %
    % #   For CoSMoMVPA's copyright information and license terms,   #
    % #   see the COPYING file distributed with CoSMoMVPA.           #

    [nsamples, nfeatures] = size(pred);
    pred_msk = ~isnan(pred);

    % allocate space for output; samples without predictions stay NaN
    winners = NaN(nsamples, 1);

    if nfeatures == 1
        % single prediction, handle separately
        [classes, unused, pred_idxs] = unique(pred(pred_msk));
        winners(pred_msk) = pred_idxs;
        return
    end

    sample_pred_count = sum(pred_msk, 2);
    sample_pred_msk = sample_pred_count > 0;
    if max(sample_pred_count) <= 1
        % only one prediction per sample; set non-predictions to zero and
        % add them up to get the prediction
        pred(~pred_msk) = 0;
        pred_merged = sum(pred(sample_pred_msk, :), 2);

        [classes, unused, pred_idxs] = unique(pred_merged);

        winners(sample_pred_msk) = pred_idxs;
        return
    end

    classes = unique(pred(pred_msk));

    % see how often each index was predicted; NaNs fall outside the bin
    % edges and are never counted
    counts = histc(pred, classes, 2);

    [max_count, idx] = max(counts, [], 2);
    nwinners = sum(bsxfun(@eq, max_count, counts), 2);

    % deal with single winners. Require at least one prediction for the
    % sample: without the sample_pred_msk conjunction, a sample with no
    % predictions would (when numel(classes)==1) have counts==0,
    % max_count==0 and nwinners==1, and thus be wrongly assigned class 1
    % instead of staying NaN.
    single_winner_msk = sample_pred_msk & (nwinners == 1);
    winners(single_winner_msk) = idx(single_winner_msk);

    % remove the single winners from samples to consider; what remains are
    % samples with a tie between two or more classes
    sample_pred_msk(single_winner_msk) = false;

    % break ties deterministically but pseudo-randomly: a running seed
    % cycles through the tied candidates across samples
    seed = 0;
    for k = find(sample_pred_msk)'
        tied_idxs = find(counts(k, :) == max_count(k));
        ntied = numel(tied_idxs);
        seed = seed + 1;
        winners(k) = tied_idxs(mod(seed, ntied) + 1);
    end