cosmo anova feature selector

function selected_indices=cosmo_anova_feature_selector(dataset, how_many)
% find the features that show the most variance between classes
%
% selected_indices=cosmo_anova_feature_selector(dataset, how_many)
%
% Inputs:
%  dataset          struct with .samples and .sa.targets
%  how_many         non-negative scalar. A value between 0 and 1 keeps
%                   round(how_many*nfeatures) features (i.e. approximately
%                   how_many*100% of the features); an integer value >=1
%                   keeps exactly how_many features.
%
% Output:
%  selected_indices   feature ids in dataset with most variance between
%                     classes, ordered from highest to lowest F value as
%                     returned by cosmo_stat(dataset,'F').
%
% Example:
%     ds=cosmo_synthetic_dataset();
%     disp(size(ds.samples))
%     %|| [ 6 6 ]
%     cosmo_anova_feature_selector(ds,.45) % find best ~45% of features
%     %|| [ 2 4 5 ]
%     cosmo_anova_feature_selector(ds,4) % find best 4 features
%     %|| [ 2 4 5 3 ]
%
% Notes:
%  - an error is raised if how_many is not a non-negative scalar, if
%    how_many>=1 is not an integer, or if how_many exceeds the number of
%    features in the dataset.
%  - features with a NaN F value are ranked last; an error is raised if
%    such a feature would nevertheless end up in the selection.
%
% #   For CoSMoMVPA's copyright information and license terms,   #
% #   see the COPYING file distributed with CoSMoMVPA.           #

    % fail early, with a clear message, on values that would otherwise
    % give a silently empty selection (negative), a cryptic indexing
    % error (NaN) or an ambiguous 'if' condition (non-scalar)
    is_valid=(isnumeric(how_many) || islogical(how_many)) && ...
                    isscalar(how_many) && ...
                    ~isnan(how_many) && ...
                    how_many>=0;
    if ~is_valid
        error('how_many must be a non-negative scalar');
    end

    fstat=cosmo_stat(dataset,'F');
    fvalues=fstat.samples;

    % ensure that nan values are not selected by setting them to
    % an impossible low F value; keep the mask so that NaN features
    % can be identified after sorting
    nan_msk=isnan(fvalues);
    fvalues(nan_msk)=-1;

    % sort by F values, largest first
    [unused, idxs]=sort(fvalues,'descend');

    % determine features to select
    nfeatures=size(dataset.samples,2);

    if how_many>=1
        % absolute count: must be an integer not exceeding nfeatures
        if round(how_many)~=how_many
            error('how_many>=1 is not an integer');
        elseif how_many>nfeatures
            error('dataset has %d features, cannot return %d',...
                    nfeatures,how_many);
        end
        nkeep=how_many;
    else
        % fraction of the features, rounded to the nearest count
        nkeep=round(how_many*nfeatures);
    end

    selected_indices=idxs(1:nkeep);

    % throw an error if any selected indices have NaN F values
    selected_nan_msk=nan_msk(selected_indices);
    if any(selected_nan_msk)
        idx=find(selected_nan_msk,1);
        error('Feature %d has NaN Fscore', selected_indices(idx));
    end