cosmo clusterize skl

function clusters = cosmo_clusterize(sample, nbrhood_mat)
    % fast depth-first clustering based on equal values of neighbors
    %
    % clusters=cosmo_clusterize(ds,nbrhood,sizes)
    %
    % Inputs:
    %   ds        A vector of size 1xN, or a dataset struct with a field
    %             .samples of size 1xN.
    %   nbrhood   PxN neighborhood definition, if each feature has P
    %             neighbors at most. Values of zero can be used as
    %             fillers if a feature has less than P neighbors.
    %
    % Output:
    %   clusters  1xQ cell with cluster indices, if Q clusters are found.
    %             C{k} denotes the indices in ds that belong to the k-th
    %             cluster. Each value in C{k} is in the range 1:N.
    %
    % Example:
    %     % generate tiny fmri dataset with 6 voxels
    %     ds=cosmo_synthetic_dataset('ntargets',1,'nchunks',1);
    %     % compute neighborhood over voxels, so that two voxels are neighbors
    %     % if they share a vertex
    %     nh=cosmo_cluster_neighborhood(ds,'progress',false);
    %     %
    %     % set the data to be, when unflatten, a 2D matrix:
    %     % [-1  0 -1]
    %     %  3  0 -1]
    %     % with three clusters:
    %     % - linear index 1         (matrix element (1,1)) with element -1
    %     % - linear index 4         (matrix element (2,1)) with element 3
    %     % - linear indices 3 and 6 (matrix elements (:,3)) with element -1
    %     % Note that elements with indices:
    %     % - 1 and 4 are not neighbors because they have different values
    %     % - 1 and 3 are not neighbors because they are not connected, as the
    %     % neighborhood definition
    %     %
    %     ds.samples=[2 0 -1 3 0 -1];
    %     %
    %     % convert neighborhood definition to matrix form
    %     nh_mat=cosmo_convert_neighborhood(nh,'matrix');
    %     %
    %     % clusterize the data
    %     cl=cosmo_clusterize(ds.samples,nh_mat);
    %     %
    %     % show cluster indices
    %     cosmo_disp(cl)
    %     %|| { [ 1 ]  [ 3    [ 4 ]
    %     %||            6 ]        }
    %
    % Notes:
    %   - Two features j and k are in the same cluster if:
    %     * their values ds(j) and ds(k) are the same (ds(j)==ds(k)),
    %     * features j and k are connected.
    %     Features j and k are connected if:
    %     * j==k, or
    %     % j and k are neighbors, or
    %     * there is a feature m, so that j and m are connected, and m and k
    %       are connected
    %   - Values of zero and NaN in ds are ignored for clusters.
    %
    % See also: cosmo_cluster_neighborhood, cosmo_convert_neighborhood
    %
    % #   For CoSMoMVPA's copyright information and license terms,   #
    % #   see the COPYING file distributed with CoSMoMVPA.           #

    check_input_sizes(sample, nbrhood_mat);

    nfeatures = size(sample, 2);
    [maxnbrs, nfeatures_matrix] = size(nbrhood_mat);

    assert(nfeatures == nfeatures_matrix);

    queue = zeros(nfeatures, 1); % queue of nodes to add to current cluster
    queue_end = 0; % last position where a value in q was stored
    queue_next = 1; % first free position

    cl_start = zeros(nfeatures + 1, 1); % start index of i-th cluster.
    % one bigger for ease in setting output
    cl_count = 0; % total number of clusters found so far
    cl_idxs = zeros(nfeatures, 1); % label for each cluster
    in_queue = false(nfeatures, 1); % indicates if an element is in the queue

    data_pos = 1; % first non-visited node

    % go over all nodes from 1 to n. xpos is the candidate node for a new
    % cluster, and we add its neighbours in a breadth-first search
    while data_pos <= nfeatures
        if cl_idxs(data_pos) > 0 % is already part of a cluster, continue
            data_pos = data_pos + 1;
            continue
        end

        data_val = sample(data_pos); % value of candidate cluster

        if data_val ~= 0 && ~isnan(data_val)
            % found an element for a new cluster

            queue(queue_next) = data_pos; % first element in the queue
            queue_end = queue_next; % last element of the queue
            cl_count = cl_count + 1; % number of clusters
            cl_start(cl_count) = queue_next; % first index of this cluster
            cl_idx = cl_count; % index for current clustter

            in_queue(data_pos) = true;

            % now visit all neighbours of q(qnext),
            % iteratively, until the queue is exhausted
            while true
                qpos = queue(queue_next); % position in queue
                cl_idxs(qpos) = cl_idx; % assign cluster index

                for j = 1:maxnbrs
                    nbr = nbrhood_mat(j, qpos);
                    % neighbour should have positive index, not queued yet,
                    % and have the same value as the first element of the
                    % current cluster
                    if nbr > 0 && ~in_queue(nbr) && sample(nbr) == data_val
                        % add to queue of nodes to be visited
                        queue_end = queue_end + 1;
                        queue(queue_end) = nbr; %

                        % mark it as queued, so it will not be added
                        % to the queue multiple times
                        in_queue(nbr) = true;
                    end
                end

                % get ready for next element in queue
                queue_next = queue_next + 1;

                % if queue is exhausted, break out and find next cluster
                if queue_next > queue_end
                    break
                end
            end
        end
        data_pos = data_pos + 1;
    end

    % allocate space for output
    clusters = cell(1, cl_count); % cluster index

    % set extra boundary for last cluster
    cl_start(cl_count + 1) = queue_end + 1;

    % assign cluster index, size, and value
    for k = 1:cl_count
        clusters{k} = queue(cl_start(k):(cl_start(k + 1) - 1));
    end

function check_input_sizes(sample, nbrhood_mat)
    if size(sample, 1) ~= 1
        error('sample input must be a row vector');
    end

    if ~isnumeric(sample) && ~islogical(sample)
        error('sample input must be numeric or logical');
    end

    if numel(size(nbrhood_mat)) ~= 2
        error('neighborhood matrix must be a matrix');
    end

    if ~isnumeric(nbrhood_mat)
        error('neighborhood matrix must be numeric');
    end

    % ensure they have matching size
    if numel(sample) ~= size(nbrhood_mat, 2)
        error(['size mismatch between data (%d elements) '...
               'and neighborhood (%d elements)'], ...
              numel(sample), size(nbrhood_mat, 2));
    end