cosmo cluster neighborhood skl

function nbrhood=cosmo_cluster_neighborhood(ds,varargin)
% define neighborhood suitable for cluster-based analysis
%
% nbrhood=cosmo_cluster_neighborhood(ds,...)
%
% Inputs:
%     ds            dataset struct
%     'fmri',fnn    Optional connectivity for voxels, if ds is an
%                   fmri dataset. Use fnn=1, 2 or 3 to let two voxels be
%                   neighbors if they share at least a a side, edge
%                   or vertex (respectively).
%                   Default: 3
%     'source',snn  Optional connectivity for grid positions, if ds is an
%                   meeg source dataset. Use snn=1, 2 or 3 to let two
%                   source positions be neighbors if they share at least a,
%                   side, edge or vertex (respectively). If ds.fa has a
%                   field 'inside', then any feature that is false for
%                   ds.fa.inside will not have any neighbors, nor will it
%                   be the neighbor of any other feature.
%                   Default: 3
%     'chan',c      Optional connectivity for channels, if ds is an MEEG
%                   dataset with channels. c=true makes channels neighbors
%                   if their Delaunay triangulation (as computed by
%                   cosmo_meeg_chan_neighors) has them share an edge.
%                   c=false means that a channel is only neighbor of
%                   itself.
%                   Default: true
%     'surface',s   Optional connectivity for nodes, if ds is a
%                   surface-based dataset. s=true makes channels neighbors
%                   if they share an edge on the surface.
%                   c=false means that a channel is only neighbor of itself
%                   Default: true
%     'vertices',v  } vertices (Px3) and faces (Qx3) of the surface;
%     'faces',f     } only required for a surface-based dataset
%     k,t           Any other feature dimension and its connectivity, e.g.
%                   k can be 'chan' or 'freq'. t indicates whether
%                   neighboring points (in time, freq, ...) are neighbors.
%                   Default: true (for any dimension not mentioned above)
%
%  Output:
%     nbrhood       Neighborhood struct
%       .neighbors  .neighbors{k}==idxs means that feature k in ds has
%                   neighbors with feature indices idxs
%       .fa         identical to ds.fa, except that a field .sizes
%                   is added indicating the size of each feature
%                   - for surfaces, this is the area of each node
%                   - in all other cases, it is set to a vector of ones.
%       .a          identical to ds.a
%
%  Examples:
%     % get neighbors in very tiny synthetic fmri dataset (only 6 voxels)
%     ds_fmri=cosmo_synthetic_dataset('type','fmri');
%     % by default use NN=3 connectivity (voxels sharing a vertex is
%     % sufficient to be neighbors)
%     nh_fmri=cosmo_cluster_neighborhood(ds_fmri,'progress',false);
%     % each voxel has 4 or 6 neighbors
%     cosmo_disp(nh_fmri.neighbors);
%     %|| { [ 1         2         4         5 ]
%     %||   [ 1         2         3         4         5         6 ]
%     %||   [ 2         3         5         6 ]
%     %||   [ 1         2         4         5 ]
%     %||   [ 1         2         3         4         5         6 ]
%     %||   [ 2         3         5         6 ]                     }
%
%     % (This example requires FieldTrip)
%     cosmo_skip_test_if_no_external('fieldtrip');
%     %
%     % get neighbors in time-lock MEEG dataset from the neuromag306
%     % system (subset of channels), with combined planar and
%     % axial channels
%     ds_meg=cosmo_synthetic_dataset('type','meeg',...
%                          'size','normal',...
%                          'sens','neuromag306_planar_combined+axial');
%     nh_meg=cosmo_cluster_neighborhood(ds_meg,'progress',false);
%     % neighbors are seperate for axial channels (odd features)
%     % and planar_combined channels (even features)
%     cosmo_disp(nh_meg.neighbors)
%     %|| { [ 1         3         4         6 ]
%     %||   [ 2         5 ]
%     %||   [ 1         3         4         6 ]
%     %||   [ 1         3         4         6 ]
%     %||   [ 2         5 ]
%     %||   [ 1         3         4         6 ] }
%
%     % (This example requires FieldTrip)
%     cosmo_skip_test_if_no_external('fieldtrip');
%     %
%     % get neighbors in EEG dataset, either with clustering over all
%     % feature dimensions (channels x time x freq) or with all feature
%     % dimensions except for channels (i.e., time x freq)
%     ds_eeg=cosmo_synthetic_dataset('type','timefreq',...
%                               'size','normal',...
%                               'sens','eeg1005');
%     % neighborhood with clustering over chan x time x freq
%     nh_eeg_full=cosmo_cluster_neighborhood(ds_eeg,'progress',false);
%     % each feature has up to 18 neighbors
%     cosmo_disp(nh_eeg_full.neighbors)
%     %|| { [ 1         2         3  ...  10       11       12 ]@1x12
%     %||   [ 1         2         3  ...  10       11       12 ]@1x12
%     %||   [ 1         2         3  ...  10       11       12 ]@1x12
%     %||                                :
%     %||   [ 19        20        21  ...  28       29       30 ]@1x12
%     %||   [ 19        20        21  ...  28       29       30 ]@1x12
%     %||   [ 19        20        21  ...  28       29       30 ]@1x12 }@30x1
%     %
%     % neighborhood with clustering over time x freq (not over chan)
%     nh_eeg_tf=cosmo_cluster_neighborhood(ds_eeg,'progress',false,...
%                                                 'chan',false);
%     % each feature has at most 6 neighbors
%     cosmo_disp(nh_eeg_tf.neighbors)
%     %|| { [ 1         4         7        10 ]
%     %||   [ 2         5         8        11 ]
%     %||   [ 3         6         9        12 ]
%     %||                    :
%     %||   [ 19        22        25        28 ]
%     %||   [ 20        23        26        29 ]
%     %||   [ 21        24        27        30 ] }@30x1
%
%
% Notes:
%   - This function uses cosmo_cross_neighborhoods internally so that
%     clusters can be formed across different dimensions.
%   - The output from this function can be used with
%     cosmo_montecarlo_cluster_stat for multiple comparison correction
%   - each dimension argument (such as 'fmri', 'source', 'chan', 'freq',
%     'time', 'surface') can be followed by a custom neighborhood, if it is
%     desired to override the neighborhoods generated by this function.
%     For most use cases this is recommended; it should only be used if you
%     really know what you are doing.
%   - To avoid making clusters along a particular dimension d, use
%     cosmo_cluster_neighborhood (...,d,false). For example, an MEEG
%     time-by-chan dataset ds could be clustered using:
%         1a) nh1a=cosmo_cluster_neighborhood(ds);
%         1b) nh1b=cosmo_cluster_neighborhood(ds,'time',true);
%         2)  nh2 =cosmo_cluster_neighborhood(ds,'time',false);
%     where 1a and 1b are equivalent, and both different from 2.
%     When used with cosmo_montecarlo_cluster_stat to correct for multiple
%     comparisons, the interpretation of any surviving cluster depends on
%     which approach was used:
%     * In approach 2, clusters are not connected by neighboring time
%        points; each cluster spans a single time point. For example, if a
%        cluster is  found at 200ms relative to stimulus onset, one can
%        infer a significant effect at 200 ms.
%     * In approach 1, clusters are connected over neighboring time points,
%       and each cluster can span multiple time points. If a cluster is
%       found spanning a time interval between 200 and 300ms, one *cannot*
%       infer a significant effect at 200 ms. One *can* infer a significant
%       effect for the time interval between 200 and 300 ms.
%     (Note that in both approaches clusters, are connected over
%     neighboring channels; spatial inferences over individual channels
%     cannot be made in either approach).
%     To summarize: in approach 1, the threshold to pass significance is
%     lower, but less strong inferences can be made than with approach 2.
%
% See also: cosmo_montecarlo_cluster_stat
%
% #   For CoSMoMVPA's copyright information and license terms,   #
% #   see the COPYING file distributed with CoSMoMVPA.           #

    % set defaults
    default=struct();
    default.progress=1000;
    opt=cosmo_structjoin(default,varargin{:});

    % check dataset
    cosmo_check_dataset(ds);

    % get neighborhoods for each individual dimension
    nbrhoods=get_dimension_neighborhoods(ds,opt);

    % cross the neighborhoods to get full neighborhood
    full_nbrhood=cosmo_cross_neighborhood(ds,nbrhoods,opt);

    % determine mapping from all feature attributes to only those in the
    % dataset
    ds2nbrhood=get_dataset2neighborhood_mapping(ds, full_nbrhood);

    % slice nbrhood to match feature attributes
    nbrhood=struct();
    nbrhood.neighbors=full_nbrhood.neighbors(ds2nbrhood);
    nbrhood.fa=cosmo_slice(full_nbrhood.fa,ds2nbrhood,2,'struct');
    nbrhood.a=full_nbrhood.a;

    origin=struct();
    origin.a=ds.a;
    origin.fa=ds.fa;
    nbrhood.origin=origin;

    nbrhood=set_feature_sizes(nbrhood);
    cosmo_check_neighborhood(nbrhood,ds);



function check_matching_fa(ds,nbrhood)
    % ensure that all dimension values in fa match between ds and nbrhood
    labels=ds.a.fdim.labels;
    for k=1:numel(labels)
        label=labels{k};
        assert(isequal(ds.fa.(label),nbrhood.fa.(label)));
    end

function nbrhood=set_feature_sizes(nbrhood)
    % set feature sizes if not set
    if ~isfield(nbrhood.fa,'sizes')
        nfeatures=numel(nbrhood.neighbors);
        nbrhood.fa.sizes=ones(1,nfeatures);
    end

function ds2nbrhood=get_dataset2neighborhood_mapping(ds, nbrhood)
    % ensure matching .fa for ds and nbrhood
    assert(isequal(nbrhood.a.fdim.labels(:),ds.a.fdim.labels(:)));

    dim_labels=ds.a.fdim.labels;
    dim_sizes=cellfun(@numel,ds.a.fdim.values(:))';
    ndim=numel(dim_labels);

    % store fa indices
    ds_fa=cell(1,ndim);
    nbrhood_fa=cell(1,ndim);
    for dim=1:ndim
        ds_fa{dim}=ds.fa.(dim_labels{dim});
        nbrhood_fa{dim}=nbrhood.fa.(dim_labels{dim});
    end

    % convert to linear indices
    if ndim==1
        % trivial case
        ds_lin=ds_fa{1};
        nbrhood_lin=nbrhood_fa{1};
    else
        ds_lin=sub2ind(dim_sizes, ds_fa{:});
        nbrhood_lin=sub2ind(dim_sizes, nbrhood_fa{:});
    end

    % total number of features in cross product
    n=prod(dim_sizes);

    % mapping from all features to those in nbrhood
    full2nbrhood=zeros(n,1);
    full2nbrhood(nbrhood_lin)=1:numel(nbrhood_lin);

    % ensure that nbrhood has no features not in ds
    ds2nbrhood=full2nbrhood(ds_lin);
    if any(ds2nbrhood==0)
        idx=find(ds2nbrhood==0,1);
        error('Missing neighborhood in dataset feature #%d',idx);
    end


function nbrhoods=get_dimension_neighborhoods(ds,opt)
    % get neighborhoods for each feature dimension

    dim_labels=ds.a.fdim.labels;
    n=numel(dim_labels);

    % keep track of which dimension labels have a neighborhood associated
    % with them
    visited=false(n,1);
    nbrhoods=cell(n,1);

    while true
        i=find(~visited,1);

        if isempty(i)
            % all done
            break;
        end

        dim_label=dim_labels{i};
        other_dim_labels={};
        switch dim_label
            case 'i'
                dim_type='fmri';
                neighborhood_func=@fmri_neighborhood;
                other_dim_labels={'j','k'};
            case 'pos'
                dim_type='source';
                neighborhood_func=@source_neighborhood;
            case 'node_indices'
                dim_type='surface';
                neighborhood_func=@surface_neighborhood;
            case 'chan'
                dim_type='chan';
                neighborhood_func=@chan_neighborhood;

            case 'mom'
                dim_type='mom';
                neighborhood_func=@mom_neighborhood;

            otherwise
                % typically for 'time' and 'freq'
                dim_type=dim_label;
                neighborhood_func=@interval_neighborhood;
        end

        % see if option is specified for this dimension label
        has_dim_opt=isfield(opt,dim_type);
        if has_dim_opt
            dim_opt=opt.(dim_type);
        else
            dim_opt=[];
        end

        if isstruct(dim_opt)
            % neighborhood struct, use directly
            cosmo_check_neighborhood(dim_opt,ds);
            nbrhood=dim_opt;
        else
            % compute neighborhood
            radius=dim_opt;
            if ~(isempty(radius) || (isscalar(radius) && radius>=0))
                error('radius for %s must be non-negative scalar',...
                        dim_type);
            end
            nbrhood=neighborhood_func(ds,i,radius,opt);
        end

        nbrhoods{i}=nbrhood;

        visited_dim_labels=[{dim_label} other_dim_labels];
        visited(cosmo_match(dim_labels,visited_dim_labels))=true;
    end

    msk=~cellfun(@isempty,nbrhoods);
    nbrhoods=nbrhoods(msk);


function nbrhood=fmri_neighborhood(ds,dim_pos,connectivity,opt)
    labels={'i','j','k'};
    nbrhood=spherical_neighborhood(ds,dim_pos,connectivity,labels,opt);


function nbrhood=source_neighborhood(ds,dim_pos,connectivity,opt)
    labels={'pos'};
    nbrhood=spherical_neighborhood(ds,dim_pos,connectivity,labels,opt);

function nbrhood=mom_neighborhood(ds,dim_pos,connectivity,opt)
    dim_values=ds.a.fdim.values{dim_pos};
    if numel(dim_values)~=1
        error(['When using a dataset with a .mom field, it can only '...
                'have a single value, because clustering for '...
                'datasets with multiple orientations of dipole '...
                'fields is (to the best knowledge of the CoSMoMVPA '...
                'developers) not properly defined. (If you have an '...
                'idea how such clustering can be defined meaningfully, '...
                'please do not hesitate to contact them']);
    end
    nbrhood=cosmo_interval_neighborhood(ds,'mom','radius',1);


function nbrhood=spherical_neighborhood(ds,dim_pos,connectivity,labels,opt)
    % for fmri and meeg-source datasets
    % labels is either {'i','j','k'} or {'pos'}
    if isempty(connectivity)
        connectivity=3; % NN=3 connectivity
    end

    if ~isscalar(connectivity) || ~isnumeric(connectivity) || ...
                ~any(connectivity==1:3)
        error('argument for connectivity must be 1, 2 or 3');
    end

    radius=sqrt(connectivity)+.001;

    dim_labels=ds.a.fdim.labels;
    nlabels=numel(dim_labels);

    label_pos=dim_pos+(0:numel(labels)-1)';

    if numel(dim_labels)>(dim_pos+nlabels-1) || ...
                ~isequal(dim_labels(label_pos),labels(:))
        error(['expected dataset with .a.fdim.labels([%d:%d])='...
                '{''%s''}. \n'...
                '- If this is an fMRI dataset or source dataset, it '...
                'seems messed up\n'...
                '- Otherwise, ''%s'' is an illegal a dimension label'],...
                dim_pos, dim_pos+nlabels-1,...
                cosmo_strjoin(labels,''', '''),...
                dim_labels{1});
    end


    nbrhood=cosmo_spherical_neighborhood(ds,'radius',radius,opt);

    keep_labels=[labels {'inside'}];

    % only keep feature attributes in labels
    remove_fa_keys=setdiff(fieldnames(nbrhood.fa),keep_labels);
    nbrhood.fa=rmfield(nbrhood.fa,remove_fa_keys);


function nbrhood=chan_neighborhood(ds,dim_pos,arg,unused)
    do_connect=get_default_connect(arg);

    assert(isequal(ds.a.fdim.labels{dim_pos},'chan'));
    nbrhood=cosmo_meeg_chan_neighborhood(ds,'chantype','all',...
                                            'label','dataset',...
                                            'delaunay',do_connect);


function nbrhood=surface_neighborhood(ds,dim_pos,arg,opt)
    do_connect=get_default_connect(arg,'surface');

    assert(isequal(ds.a.fdim.labels{dim_pos},'node_indices'));

    if ~all(cosmo_isfield(opt,{'vertices','faces'}))
        error(['for surfaces, use the syntax\n\n',...
                '  %s(''vertices'',v,''faces'',f)\n' ,...
                'to specify the coordinates v and the faces f'],...
                mfilename());
    end

    surf_def={opt.vertices,opt.faces};
    cosmo_check_external('surfing');
    nbrhood=cosmo_surficial_neighborhood(ds,surf_def,...
                                    'direct',do_connect,...
                                    'metric','dijkstra',opt);


    node_area_surf=surfing_surfacearea(opt.vertices,opt.faces);
    feature_ids=ds.a.fdim.values{dim_pos}(nbrhood.fa.node_indices);
    node_area_ds=node_area_surf(feature_ids);

    nbrhood.fa.sizes=node_area_ds';


function nbrhood=interval_neighborhood(ds,dim_pos,arg,unused)
    dim_label=ds.a.fdim.labels{dim_pos};
    do_connect=get_default_connect(arg,dim_label);

    nbrhood=cosmo_interval_neighborhood(ds,dim_label,'radius',do_connect);


function do_connect=get_default_connect(arg, label)
    % helper, returns true by default. requires arg to be logical to
    % override
    if isempty(arg)
        % delaunay
        do_connect=true;
    else
        if ~islogical(arg)
            error('argument for ''%s'' must be logical', label);
        end
        do_connect=arg;
    end