cosmo measure clusters skl

function cval = cosmo_measure_clusters(sample, nbrhood_mat, cluster_stat, varargin)
    % General cluster measure
    %
    % cval=cosmo_measure_clusters(sample,nbrhood_mat,method,['dh',dh|'threshold',t],...)
    %
    % Inputs:
    %   sample              1xP data array for k features
    %   nbrhood_mat         MxP neighborhood matrix, if each feature has at
    %                       most M neighbors. nbrhood_mx(:,k) should contain
    %                       the feature ids of the neighbors of feature k
    %                       (values of zero indicate no neighbors)
    %   cluster_stat        'tfce', 'max', 'maxsize', or 'maxsum'.
    %   'dh',dh             when method is 'tfce', the integral step size
    %   'threshold',t       if method is anything but 'tfce', the threshold
    %                       level
    %   'E', E              (optional) when method is 'tfce': the extent
    %                       exponent, default E=0.5
    %   'H', E              (optional) when method is 'tfce': the height
    %                       exponent, default H=2
    %   'feature_sizes', s  (optional) 1xP element size of each feature;
    %                       default is a vector with all elements equal to 1.
    %
    % Output:
    %   cval           1xP data array with cluster values.
    %                  Define:
    %                     cl_k(thr)       Feature ids that are connected
    %                                     with feature k when sample is
    %                                     thresholded at thr.
    %                                     (with cl_k(thr)=[] if sample(k)<thr).
    %                     e_k(thr)        The extent of the cluster for feature
    %                                     k when thresholded at thr, i.e.
    %                                       e_k=sum[i=cl_k(thr)] s(i)
    %                     max0[x=xs]v(x)  the maximum value that v takes for
    %                                     any value in xs, or zero if xs is
    %                                     empty.
    %                  The output for a feature k depends on cluster_stat:
    %                  - 'tfce':
    %                       cval(k)=sum[h=dh:dh:max(sample)] e_k(h)^E *h^H * dh
    %                    i.e. a weighted sum of the extent and height of
    %                    the cluster at different thresholds
    %                  - 'max':
    %                       cval(k)=max0[i=cl_k(thr)] samples(i))
    %                    i.e. the maximum value in the cluster that contains k
    %                  - 'maxsize'
    %                       cval(k)=e_k(thr)
    %                    i.e. the size of the cluster that contains k
    %                  - 'maxsum'
    %                       cval(k)=sum[i=cl_k(thr)] samples(i)
    %                    i.e. the sum of the values in the cluster that
    %                    contains k
    %
    % Examples:
    %     % very simple one-dimensional 'line' example
    %     samples=     [1 2 1 1 0 2 2];
    %     nbrhood_mat= [1 1 2 3 4 5 6;...
    %                   2 2 3 4 5 6 7;...
    %                   0 3 4 5 6 7 0];
    %     % illustrate TFCE cluster stat. Note that the results vary little
    %     % as a function of dh, and that both features in clusters with a
    %     % larger extent but less extreme value and features in clusters with
    %     % smaller extent but more extreme values have larger values
    %     s=cosmo_measure_clusters(samples,nbrhood_mat,'tfce','dh',.1);
    %     cosmo_disp(s,'threshold',inf)
    %     %|| [ 0.77  2.86  0.77  0.77     0  3.49  3.49 ]
    %     %
    %     s=cosmo_measure_clusters(samples,nbrhood_mat,'tfce','dh',.05);
    %     cosmo_disp(s,'threshold',inf)
    %     %|| [ 0.618  2.88  0.618  0.618     0  3.63  3.63 ]
    %     %
    %     % illustrate other cluster stats. Note that the results varies more
    %     % as a function of the chosen threshold
    %     cosmo_measure_clusters(samples,nbrhood_mat,'max','threshold',1)
    %     %||  2     2     2     2     0     2     2
    %     %
    %     cosmo_measure_clusters(samples,nbrhood_mat,'max','threshold',2)
    %     %||  0     2     0     0     0     2     2
    %     %
    %     cosmo_measure_clusters(samples,nbrhood_mat,'maxsize','threshold',1)
    %     %||  4     4     4     4     0     2     2
    %     %
    %     cosmo_measure_clusters(samples,nbrhood_mat,'maxsize','threshold',2)
    %     %||  0     1     0     0     0     2     2
    %     %
    %     cosmo_measure_clusters(samples,nbrhood_mat,'maxsum','threshold',1)
    %     %||  5     5     5     5     0     4     4
    %     %
    %     cosmo_measure_clusters(samples,nbrhood_mat,'maxsum','threshold',2)
    %     %||  0     2     0     0     0     4     4
    %
    % Notes:
    %   - the 'method' argument is similar to FieldTrip's clusterstat method,
    %     but does support 'tfce' and does not support 'wcm'
    %   - unlike FieldTrip's clusterstat, a value is returned for each feature
    %     (rather than each cluster).
    %   - TFCE is advised in the general case, because it finds a compromise
    %     between magnitude of values and extent of clusters.
    %   - this function is used by cosmo_montecarlo_cluster_stat for
    %     non-parametric cluster-based correction for multiple comparisons.
    %
    % References:
    %   - Stephen M. Smith, Thomas E. Nichols (2009), Threshold-free
    %     cluster enhancement: Addressing problems of smoothing, threshold
    %     dependence and localisation in cluster inference, NeuroImage, Volume
    %     44, 83-98.
    %   - Maris, E., Oostenveld, R. Nonparametric statistical testing of EEG-
    %     and MEG-data. Journal of Neuroscience Methods (2007).
    %
    % See also: cosmo_convert_neighborhood, cosmo_montecarlo_cluster_stat
    %
    % #   For CoSMoMVPA's copyright information and license terms,   #
    % #   see the COPYING file distributed with CoSMoMVPA.           #

    persistent cached_varargin
    persistent cached_opt

    if isequal(varargin, cached_varargin)
        opt = cached_opt;
    else
        opt = cosmo_structjoin(varargin);
        cached_opt = opt;
        cached_varargin = varargin;
    end

    % used for TFCE
    defaults = struct();
    defaults.E = .5;
    defaults.H = 2;

    % get size of each feature
    feature_sizes = get_feature_sizes(sample, opt);

    % check input size
    check_input_sizes(sample, nbrhood_mat);

    % set cluster parameters depending on method; outputs are:
    %   delta_thr      value by which the threshold is increased for every
    %                  iteration (in case of TFCE), or the threshold value
    %                  itself (for any other method)
    %   cluster_func   function handle which takes as inputs:
    %                    idxs       feature ids of elements in a cluster
    %                    sizes      size of each feature; same size as data
    %                    thr        threshold value
    %   nthr           number of thresholds (1 for any method except for
    %                  TFCE)

    [delta_thr, cluster_func, nthr, has_inf] = get_clustering_params( ...
                                                                     cluster_stat, ...
                                                                     sample, ...
                                                                     feature_sizes, ...
                                                                     opt, defaults);

    % initially all features survive the mask, except non-finite ones
    super_threshold_msk = ~isnan(sample);
    infinity_msk = isinf(sample) & sample > 0;

    % keep
    thr_counter = 0;

    % cluster threshold variable. For TFCE this value is increased multiple
    % times in the while-loop below. For all other methods it is increased
    % just once
    thr = 0;

    % allocate space for output
    cval = zeros(size(sample));

    while thr_counter < nthr
        % increase threshold
        thr = thr + delta_thr;

        % update mask of features above threshold
        super_threshold_msk = super_threshold_msk & sample >= thr;

        if has_inf
            % if output can contain infinity, break if all values are less
            % than the threshold or infinity
            if ~any(super_threshold_msk & ~infinity_msk)
                % no features surive, clustering is completed
                break
            end
        else
            % if output cannot contain infinity, break if all values are
            % less than the threshold
            if ~any(super_threshold_msk)
                break
            end
        end

        % apply clustering
        clusters = cosmo_clusterize(super_threshold_msk, nbrhood_mat);

        % process each cluster
        for k = 1:numel(clusters)
            cluster = clusters{k};
            stat = cluster_func(cluster, thr);

            cval(cluster) = cval(cluster) + stat;
        end

        % increase the threshold counter. For all methods except tfce, the
        % while loop is quit after the first iteration
        thr_counter = thr_counter + 1;
    end

    if has_inf
        cval(infinity_msk) = Inf;
    end

function [delta_thr, cluster_func, nthr, has_inf] = get_clustering_params( ...
                                                                          method, ...
                                                                          sample, sizes, ...
                                                                          opt, defaults)
    switch method
        case 'tfce'
            delta_thr = get_tfce_threshold_height(opt);
            cluster_func = get_tfce_cluster_func(sizes, delta_thr, ...
                                                 opt, defaults);
            nthr = Inf;
            has_inf = true;

        case 'max'
            cluster_func = @(idxs, thr) max(sample(idxs));
            delta_thr = get_fixed_threshold_height(opt);
            nthr = 1;
            has_inf = true;

        case 'maxsize'
            cluster_func = @(idxs, thr) sum(sizes(idxs));
            delta_thr = get_fixed_threshold_height(opt);
            nthr = 1;
            has_inf = false;

        case 'maxsum'
            cluster_func = @(idxs, thr) sum(sample(idxs));
            delta_thr = get_fixed_threshold_height(opt);
            nthr = 1;
            has_inf = true;

        otherwise
            error('illegal method ''%s''', method);
    end

function feature_sizes = get_feature_sizes(sample, opt)
    nfeatures = size(sample, 2);

    if isfield(opt, 'feature_sizes')
        % if given as option, use those
        feature_sizes = opt.feature_sizes;
        % ensure it is a row vector
        if ~isequal(size(feature_sizes), [1, nfeatures])
            error('feature sizes must be of size 1x%d', nfeatures);
        end
    else
        feature_sizes = ones(1, nfeatures);
    end

function func = get_tfce_cluster_func(sizes, threshold_height, opt, defaults)
    % helper to get the cluster function for TFCE, using either supplied or
    % default values for the E and H parameters
    persistent cached_opt
    persistent cached_defaults
    persistent cached_opt_all

    if isequal(opt, cached_opt) && isequal(defaults, cached_defaults)
        opt_all = cached_opt_all;
    else
        opt_all = cosmo_structjoin(defaults, opt);
        cached_opt_all = opt_all;

        cached_opt = opt;
        cached_defaults = defaults;
    end

    tfce_E = opt_all.E;
    tfce_H = opt_all.H;

    check_positive_scalar(tfce_E, 'E');
    check_positive_scalar(tfce_H, 'H');

    func = @(idxs, thr) sum(sizes(idxs))^tfce_E ...
                                          * thr^tfce_H * threshold_height;

function dh = get_fixed_threshold_height(opt)
    % for everything except TFCE
    if ~isfield(opt, 'threshold')
        error('missing option ''threshold''');
    end

    if any(cosmo_isfield(opt, {'E', 'B', 'dh'}))
        error(['options ''dh'', ''E'' or ''B'' are not allowed '...
               'for this method']);
    end

    dh = opt.threshold;
    if ~isscalar(dh) || ~isfinite(dh)
        error('option ''threshold'' must be a finite scalar');
    end

function dh = get_tfce_threshold_height(opt)
    if isfield(opt, 'threshold')
        error('option ''threshold'' is not allowed for this method');
    end

    if ~isfield(opt, 'dh')
        error('missing option ''dh'' for ''tfce'' method');
    end

    dh = opt.dh;
    check_positive_scalar(dh, 'dh');

function check_positive_scalar(value, name)
    if ~isscalar(value) || ~isfinite(value) || value <= 0
        error('option ''%s'' must be a positive scalar', name);
    end

function check_input_sizes(sample, nbrhood_mx)
    if size(sample, 1) ~= 1
        error('sample input must be a row vector');
    end

    if ~isnumeric(sample) && ~islogical(sample)
        error('sample input must be numeric or logical');
    end

    if numel(size(nbrhood_mx)) ~= 2
        error(['neighborhood matrix must be a matrix '...
               '(use cosmo_convert_neighborhood to convert a '...
               'neighborhood struct to matrix representation']);
    end

    if ~isnumeric(nbrhood_mx)
        error('neighborhood matrix must be numeric');
    end

    % ensure they have matching size
    if numel(sample) ~= size(nbrhood_mx, 2)
        error(['size mismatch between data (%d elements) '...
               'and neighborhood (%d elements)'], ...
              numel(sample), size(nbrhood_mx, 2));
    end