cosmo clusterize

function clusters=cosmo_clusterize(sample,nbrhood_mat)
% fast depth-first clustering based on equal values of neighbors
%
% clusters=cosmo_clusterize(ds,nbrhood,sizes)
%
% Inputs:
%   ds        A vector of size 1xN, or a dataset struct with a field
%             .samples of size 1xN.
%   nbrhood   PxN neighborhood definition, if each feature has P
%             neighbors at most. Values of zero can be used as
%             fillers if a feature has less than P neighbors.
%
% Output:
%   clusters  1xQ cell with cluster indices, if Q clusters are found.
%             C{k} denotes the indices in ds that belong to the k-th
%             cluster. Each value in C{k} is in the range 1:N.
%
% Example:
%     % generate tiny fmri dataset with 6 voxels
%     ds=cosmo_synthetic_dataset('ntargets',1,'nchunks',1);
%     % compute neighborhood over voxels, so that two voxels are neighbors
%     % if they share a vertex
%     nh=cosmo_cluster_neighborhood(ds,'progress',false);
%     %
%     % set the data to be, when unflatten, a 2D matrix:
%     % [-1  0 -1]
%     %  3  0 -1]
%     % with three clusters:
%     % - linear index 1         (matrix element (1,1)) with element -1
%     % - linear index 4         (matrix element (2,1)) with element 3
%     % - linear indices 3 and 6 (matrix elments (:,3)) with element -1
%     % Note that elements with indices:
%     % - 1 and 4 are not neighbors because they have different values
%     % - 1 and 3 are not neighbors because they are not connected, as the
%     % neighborhood definition
%     %
%     ds.samples=[2 0 -1 3 0 -1];
%     %
%     % convert neighborhood definition to matrix form
%     nh_mat=cosmo_convert_neighborhood(nh,'matrix');
%     %
%     % clusterize the data
%     cl=cosmo_clusterize(ds.samples,nh_mat);
%     %
%     % show cluster indices
%     cosmo_disp(cl)
%     %|| { [ 1 ]  [ 3    [ 4 ]
%     %||            6 ]        }
%
% Notes:
%   - Two features j and k are in the same cluster if:
%     * their values ds(j) and ds(k) are the same (ds(j)==ds(k)),
%     * features j and k are connected.
%     Features j and k are connected if:
%     * j==k, or
%     % j and k are neigbors, or
%     * there is a feature m, so that j and m are connected, and m and k
%       are connected
%   - Values of zero and NaN in ds are ignored for clusters.
%
% See also: cosmo_cluster_neighborhood, cosmo_convert_neighborhood
%
% #   For CoSMoMVPA's copyright information and license terms,   #
% #   see the COPYING file distributed with CoSMoMVPA.           #

    check_input_sizes(sample,nbrhood_mat);

    nfeatures=size(sample,2);
    [maxnbrs,nfeatures_matrix]=size(nbrhood_mat);

    assert(nfeatures==nfeatures_matrix);

    queue=zeros(nfeatures,1); % queue of nodes to add to current cluster
    queue_end=0; % last position where a value in q was stored
    queue_next=1; % first free position

    cl_start=zeros(nfeatures+1,1); % start index of i-th cluster.
                                  % one bigger for ease in setting output
    cl_count=0; % total number of clusters found so far
    cl_idxs=zeros(nfeatures,1); % label for each cluster
    in_queue=false(nfeatures,1); % indicates if an element is in the queue

    data_pos=1; % first non-visited node

    % go over all nodes from 1 to n. xpos is the candidate node for a new
    % cluster, and we add its neighbours in a breadth-first search
    while data_pos<=nfeatures
        if cl_idxs(data_pos)>0 % is already part of a cluster, continue
            data_pos=data_pos+1;
            continue;
        end

        data_val=sample(data_pos); % value of candidate cluster

        if data_val~=0 && ~isnan(data_val)
            % found an element for a new cluster

            queue(queue_next)=data_pos; % first element in the queue
            queue_end=queue_next; % last element of the queue
            cl_count=cl_count+1; % number of clusters
            cl_start(cl_count)=queue_next; % first index of this cluster
            cl_idx=cl_count; % index for current clustter

            in_queue(data_pos)=true;

            % now visit all neighbours of q(qnext),
            % iteratively, until the queue is exhausted
            while true
                qpos=queue(queue_next); % position in queue
                cl_idxs(qpos)=cl_idx; % assign cluster index

                for j=1:maxnbrs
                    nbr=nbrhood_mat(j,qpos);
                    % neighbour should have positive index, not queued yet,
                    % and have the same value as the first element of the
                    % current cluster
                    if nbr>0 && ~in_queue(nbr) && sample(nbr)==data_val
                        % add to queue of nodes to be visited
                        queue_end=queue_end+1;
                        queue(queue_end)=nbr; %

                        % mark it as queued, so it will not be added
                        % to the queue multiple times
                        in_queue(nbr)=true;
                    end
                end

                % get ready for next element in queue
                queue_next=queue_next+1;

                % if queue is exhausted, break out and find next cluster
                if queue_next>queue_end
                    break;
                end
            end
        end
        data_pos=data_pos+1;
    end

    % allocate space for output
    clusters=cell(1,cl_count); % cluster index

    % set extra boundary for last cluster
    cl_start(cl_count+1)=queue_end+1;

    % assign cluster index, size, and value
    for k=1:cl_count
        clusters{k}=queue(cl_start(k):(cl_start(k+1)-1));
    end



function check_input_sizes(sample,nbrhood_mat)
    if size(sample,1)~=1
        error('sample input must be a row vector');
    end

    if ~isnumeric(sample) && ~islogical(sample)
        error('sample input must be numeric or logical');
    end

    if numel(size(nbrhood_mat))~=2
        error('neighborhood matrix must be a matrix');
    end

    if ~isnumeric(nbrhood_mat)
        error('neighborhood matrix must be numeric');
    end

    % ensure they have matching size
    if numel(sample)~=size(nbrhood_mat,2)
        error(['size mismatch between data (%d elements) '...
                'and neighborhood (%d elements)'],...
                    numel(sample),size(nbrhood_mat,2));
    end