cosmo neighborhood split

function [nh_split, masks] = cosmo_neighborhood_split(nh, varargin)
    % partitions a neighborhood in a cell with (smaller) neigborhoods
    %
    % cosmo_neighborhood_split(nh, ...)
    %
    % Inputs:
    %   nh                  neighborhood struct with fields .neighbors and
    %                       .origin
    %   'divisions', d      Number of divisions along each feature dimension.
    %                       For example, if d=4 for an fMRI dataset (with
    %                       three feature dimensions corresponding to the three
    %                       spatial dimensions), then the output will have
    %                       at most 4^3=64 neighborhood structs and masks.
    %
    % Output:
    %   nh_split            Cell with neighborhood structs. When all elements
    %                       in nh_split are combined they should contain the
    %                       same information in .neighbors and .fa as in the
    %                       input nh, but with a re-indexing of the feature ids
    %                       corresponding to the masks.
    %   masks               Cell with mask datasets, corresponding to the
    %                       features returned in neighborhoods in nh_split.
    %                       When the dataset derived from the input
    %                       neighborhood nh is masked by the k-th mask, then
    %                       the indices in the resulting dataset match the
    %                       indices in .neighbors
    %
    %
    % Example:
    %     % (This example cannot be documentation tested using Octave,
    %     %  since Octave does not allow for-loops with evalc)
    %     cosmo_skip_test_if_no_external('matlab');
    %     %
    %     % the following example shows how a neighborhood can be split in
    %     % parts, a searchlight run on each part, and the different outputs
    %     % joined together. This gives identical results as running the
    %     % searchlight on the entire dataset
    %     %
    %     % generate a tiny dataset with 6 voxels
    %     ds=cosmo_synthetic_dataset();
    %     %
    %     % define tiny neighborhoods with radii of 1 voxel
    %     % (more typical values are ~3 voxels)
    %     nh=cosmo_spherical_neighborhood(ds,'radius',1,'progress',false);
    %     %
    %     % Use a simple 'measure' that just averages data along the feature
    %     % dimension. (Any other measure can be used as well)
    %     averager=@(x,opt)cosmo_structjoin('samples',mean(x.samples,2));
    %     %
    %     % disable progress reporting for the searchlight
    %     opt=struct();
    %     opt.progress=false;
    %     %
    %     % run the searchlight
    %     result=cosmo_searchlight(ds,nh,averager,opt);
    %     %
    %     % show output from searchlight
    %     cosmo_disp({result.samples; result.fa});
    %     %|| { [ 0.768     0.368        -1      1.45    0.0342     -0.32
    %     %||     0.526      1.77     0.937      1.08      1.07      1.49
    %     %||      0.46     -1.25    -0.152    0.0899     0.795    -0.522
    %     %||      1.23     0.685     0.954     0.606      1.19     0.335
    %     %||     0.914   -0.0442    0.0295     0.664     0.274    -0.221
    %     %||     0.633      1.24     0.797     0.861      1.54      1.02 ]
    %     %||   .nvoxels
    %     %||     [ 3         4         3         3         4         3 ]
    %     %||   .radius
    %     %||     [ 1         1         1         1         1         1 ]
    %     %||   .center_ids
    %     %||     [ 1         2         3         4         5         6 ]
    %     %||   .i
    %     %||     [ 1         2         3         1         2         3 ]
    %     %||   .j
    %     %||     [ 1         1         1         2         2         2 ]
    %     %||   .k
    %     %||     [ 1         1         1         1         1         1 ]   }
    %     %
    %     [nh_splits,masks_split]=cosmo_neighborhood_split(nh);
    %     n_split=numel(nh_splits);
    %     %
    %     % Allocate space for searchlight output from each part
    %     res2_cell=cell(1,n_split);
    %     %
    %     for k=1:n_split
    %         % get the k-th neighborhood
    %         nh_sel=nh_splits{k};
    %         %
    %         % slice the dataset to select only the features indexed by
    %         % the nh_sel neighborhood.
    %         %
    %         % (when using fmri datasets, it is also possible to use
    %         %  cosmo_fmri_dataset with a filename and the mask from masks,
    %         %  as in:
    %         %
    %         %     ds_sel=cosmo_fmri_dataset('data.nii','mask',masks_split{k});
    %         %
    %         %  Such an approach may result in significant reduction of memory
    %         %  usage, if the file format supports loading partial files
    %         %  (currently unzipped NIFTI (.nii), ANALYZE (.hdr) and AFNI)))
    %         ds_sel=cosmo_slice(ds,masks_split{k}.samples,2);
    %         %
    %         % run the searchlight for this split, and store the result
    %         res2_cell{k}=cosmo_searchlight(ds_sel,nh_sel,averager,opt);
    %     end;
    %     %
    %     % join the results along the second (feature) dimension
    %     result2=cosmo_stack(res2_cell,2);
    %     %
    %     % show the results; they are identical to the original output, modulo
    %     % a possible permutation of the features.
    %     cosmo_disp({result2.samples; result2.fa});
    %     %|| { [ 0.768     0.368      1.45    0.0342        -1     -0.32
    %     %||     0.526      1.77      1.08      1.07     0.937      1.49
    %     %||      0.46     -1.25    0.0899     0.795    -0.152    -0.522
    %     %||      1.23     0.685     0.606      1.19     0.954     0.335
    %     %||     0.914   -0.0442     0.664     0.274    0.0295    -0.221
    %     %||     0.633      1.24     0.861      1.54     0.797      1.02 ]
    %     %||   .center_ids
    %     %||     [ 1         2         3         4         1         2 ]
    %     %||   .i
    %     %||     [ 1         2         1         2         3         3 ]
    %     %||   .j
    %     %||     [ 1         1         2         2         1         2 ]
    %     %||   .k
    %     %||     [ 1         1         1         1         1         1 ]
    %     %||   .nvoxels
    %     %||     [ 3         4         3         4         3         3 ]
    %     %||   .radius
    %     %||     [ 1         1         1         1         1         1 ]   }
    %
    %
    % Notes:
    %   - typical use cases of this function are:
    %     * searchlights on fMRI dataset with many samples and features
    %       (i.e. too many to load all of them in memory). In this case, and
    %       if the file format supports it (NIFTI, ANALYZE, AFNI), parts
    %       of the data can be loaded with cosmo_fmri_datset, where the masks
    %       are based on the second output from this function
    %     * parallel searchlights on a cluster using multiple computing nodes.
    %
    % See also: cosmo_searchlight, cosmo_fmri_dataset
    %
    % #   For CoSMoMVPA's copyright information and license terms,   #
    % #   see the COPYING file distributed with CoSMoMVPA.           #

    defaults = struct();
    defaults.divisions = 4;

    opt = cosmo_structjoin(defaults, varargin);
    validate_input(nh, opt);

    % see how big each block is along each dimension
    [first, last, nfeatures] = get_dim_range(nh);

    min_first = min(first, [], 2);
    max_last = max(last, [], 2);
    max_size = max(last - first, [], 2) + 1;
    extent = max_last - min_first + 1;

    % Examples of how max_size, step and extent are related:
    % max_size=1, step=5, extent=25, 5 div: 1-5, 6-10, 11-15, 16-20, 21-25
    % max_size=1, step=4, extent=20, 5 div: 1-4,  5-8,  9-12, 13-16, 17-20
    % max_size=2, step=5, extent=26, 5 div: 1-6, 6-11, 11-16, 16-21, 21-26
    % max_size=2, step=4, extent=21, 5 div: 1-5,  5-9,  9-13, 13-17, 17-21
    % max_size=3, step=4, extent=22, 5 div: 1-6, 5-10,  9-14, 13-18, 17-22

    % first element:                step+max_size-1     elements
    % all other (div-1) elements:   step                elements

    step = ceil((extent + 1 - max_size) / opt.divisions);
    assert(all(step >= 1));

    % for each element in nh.neighbors, assign it to a block
    % indexed by ndim integers (where ndim is the number of dimensions)
    dim_pos = floor(bsxfun(@rdivide, first - 1, step)) + 1;

    % get the indices for each block.
    % idxs{k}=[a_k1, ..., a_kN] means that nh.neighbors{a_k1}, ...,
    % nh.neighbors{a_kN} all refer to neighboring features
    %
    [idxs, unq] = cosmo_index_unique(dim_pos');

    nsplit = size(unq, 1);

    nh_split = cell(nsplit, 1);
    masks = cell(nsplit, 1);

    keep_split_mask = false(nsplit, 1);

    for split_id = 1:nsplit
        msk = false(1, nfeatures);
        idx = idxs{split_id};
        n_nbrs = numel(idx);
        for j = 1:n_nbrs
            nb = nh.neighbors{idx(j)};
            msk(1, nb) = true;
        end

        if ~any(msk)
            % skip empty neighbors
            continue
        end

        keep_split_mask(split_id) = true;

        m_k = struct();
        m_k.samples = msk;
        m_k.a = nh.origin.a;
        m_k.fa = nh.origin.fa;

        nh_k = struct();
        nh_k.origin.a = m_k.a;
        nh_k.origin.fa = cosmo_slice(m_k.fa, msk, 2, 'struct');
        nh_k.a = nh.a;
        nh_k.fa = cosmo_slice(nh.fa, idx, 2, 'struct');

        all2some = zeros(1, nfeatures);
        all2some(msk) = 1:sum(msk);
        neighbors = cell(n_nbrs, 1);
        for j = 1:n_nbrs
            some = all2some(nh.neighbors{idx(j)});
            assert(all(some ~= 0));
            neighbors{j} = some;
        end
        nh_k.neighbors = neighbors;

        nh_split{split_id} = nh_k;
        masks{split_id} = m_k;
    end

    % only keep non-empty masks
    nh_split = nh_split(keep_split_mask);
    masks = masks(keep_split_mask);

function [first, last, nfeatures] = get_dim_range(nh)
    % for each feature dimension, find the minimum and maximum value
    % in each neighborhood
    % If nh has N neighbors and M dimensions, then the output of first and
    % last in M x N.
    fdim = nh.origin.a.fdim;
    labels = fdim.labels;

    n_labels = numel(labels);
    n_neighbors = numel(nh.neighbors);

    fa = nh.origin.fa;

    % pre-store values for each feature attribute
    fa_idxs_cell = cell(n_labels, 1);

    for k = 1:n_labels
        label = labels{k};
        fa_idxs = fa.(label);
        fa_idxs_cell{k} = fa_idxs;
    end

    first = zeros(n_labels, n_neighbors);
    last = zeros(n_labels, n_neighbors);
    nfeatures = 0;

    for j = 1:n_neighbors
        nb = nh.neighbors{j};

        mx_nb = max(nb);
        if mx_nb > nfeatures
            nfeatures = mx_nb;
        end

        for k = 1:n_labels
            % (this is a timing-critical part of the loop)
            nb_values = fa_idxs_cell{k}(nb);

            first(k, j) = min(nb_values);
            last(k, j) = max(nb_values);
        end
    end

function opt = validate_input(nh, opt)
    % some checks on the input
    cosmo_check_neighborhood(nh);

    allowed_fieldnames = {'divisions'};
    delta = setdiff(fieldnames(opt), allowed_fieldnames);
    if ~isempty(delta)
        error('illegal field %s', delta{1});
    end

    divisions = opt.divisions;
    if ~(isnumeric(divisions) && ...
         isscalar(divisions) && ...
         isfinite(divisions) && ...
         divisions > 1)
        error('count must be finite scalar >1');
    end

    origin_labels = {'origin.fa', ...
                     'origin.a.fdim.labels', ...
                     'origin.a.fdim.values'};
    if ~all(cosmo_isfield(nh, origin_labels))
        error('origin has missing labels');
    end

    is_numeric_vector = @(x)isnumeric(x) && isvector(x);
    if ~all(cellfun(is_numeric_vector, nh.a.fdim.values))
        error('elements in .a.fdim.values must be numeric vectors');
    end