function ds_sa = cosmo_correlation_measure(ds, varargin)
% Computes a split-half correlation measure
%
% ds_sa=cosmo_correlation_measure(ds[, args])
%
% Inputs:
% ds dataset structure with fields .samples, .sa.targets and
% .sa.chunks
% args optional struct with the following optional fields:
% .partitions struct with fields .train_indices and .test_indices.
% Both should be a Px1 cell for P partitions. If omitted,
% it is set to cosmo_nchoosek_partitioner(ds,'half').
% .template QxQ matrix for Q classes in each chunk. This matrix
% weights the correlations across the two halves.
% If ds.sa.targets has only one unique value, it must be
% set to the scalar value 1; otherwise it should
% have a mean of zero. If omitted, it has values
% of 1/Q on the diagonal and -1/(Q*(Q-1)) off the
% diagonal.
% (Note: this can be used to test for representational
% similarity matching)
% .merge_func A function handle used to merge data from matching
% targets in the same chunk. Default is @(x) mean(x,1),
% meaning that samples sharing a target and chunk are
% averaged.
% It is assumed that isequal(args.merge_func(y),y) if y
% is a row vector.
% .corr_type Type of correlation: 'Pearson','Spearman','Kendall'.
% The default is 'Pearson'.
% .post_corr_func Operation performed after correlation. (default:
% @atanh)
% .output 'mean' (default): correlations weighted by template
% 'raw' or 'correlation': correlations between all classes
% 'one_minus_correlation': no longer supported; using
% this value raises an error
% 'mean_by_fold': provide weighted correlations for each
% fold in the partitions.
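%
% As an illustrative sketch (not executed by this function), the
% default template for Q unique targets could be constructed as
% follows, with Q=3 a hypothetical number of classes:
%
% Q=3;
% template=(eye(Q)-1/Q)/(Q-1); % 1/Q on the diagonal,
% % -1/(Q*(Q-1)) off the diagonal;
% % all elements sum to zero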
%
%
% Output:
% ds_sa Struct with fields:
% .samples Scalar indicating how well the template matrix
% correlates with the correlation matrix from the two
% halves (averaged over partitions). By default:
% - this value is based on Fisher-transformed correlation
% values, not raw correlation values
% - this is the average of the (Fisher-transformed)
% on-diagonal minus the average of the
% (Fisher-transformed) off-diagonal elements of the
% correlation matrix based on the two halves of the data.
% .sa Struct with fields:
% .labels if output=='mean'
% .half1 } if output=='raw': (N^2)x1 vectors with indices of data
% .half2 } from two halves, with N the number of unique targets.
% .partition if output=='mean_by_fold': index of each
% partition (fold)
%
% Example:
% ds=cosmo_synthetic_dataset();
% %
% % compute on-minus-off diagonal correlations
% c=cosmo_correlation_measure(ds);
% cosmo_disp(c)
% %|| .samples
% %|| [ 1.23 ]
% %|| .sa
% %|| .labels
% %|| { 'corr' }
% %
% % Spearman correlation requires the Matlab statistics toolbox
% cosmo_skip_test_if_no_external('@stats');
% %
% c=cosmo_correlation_measure(ds,'corr_type','Spearman');
% cosmo_disp(c)
% %|| .samples
% %|| [ 1.28 ]
% %|| .sa
% %|| .labels
% %|| { 'corr' }
%
% ds=cosmo_synthetic_dataset();
% % get raw correlations without Fisher transformation
% c_raw=cosmo_correlation_measure(ds,'output','correlation',...
% 'post_corr_func',[]);
% cosmo_disp(c_raw)
% %|| .samples
% %|| [ 0.363
% %|| -0.404
% %|| -0.447
% %|| 0.606 ]
% %|| .sa
% %|| .half1
% %|| [ 1
% %|| 2
% %|| 1
% %|| 2 ]
% %|| .half2
% %|| [ 1
% %|| 1
% %|| 2
% %|| 2 ]
% %|| .a
% %|| .sdim
% %|| .labels
% %|| { 'half1' 'half2' }
% %|| .values
% %|| { [ 1 [ 1
% %|| 2 ] 2 ] }
% %
% % convert to matrix form (N x N x P, with N the number of classes and
% % P=1)
% matrices=cosmo_unflatten(c_raw,1);
% cosmo_disp(matrices)
% %|| [ 0.363 -0.447
% %|| -0.404 0.606 ]
%
% % compute for each fold separately, using a custom take-one-chunk
% % out partitioning scheme and without Fisher transformation of
% % the correlations.
% % Note that c.sa.partition in the output gives the index of the
% % partition for each fold
% ds=cosmo_synthetic_dataset('type','fmri','nchunks',4);
% partitions=cosmo_nfold_partitioner(ds);
% c=cosmo_correlation_measure(ds,'output','mean_by_fold',...
% 'partitions',partitions,...
% 'post_corr_func',[]);
% cosmo_disp(c.samples);
% %|| [ 1.32
% %|| 0.512
% %|| 1.05
% %|| 1.23 ]
% cosmo_disp(c.sa);
% %|| .partition
% %|| [ 1
% %|| 2
% %|| 3
% %|| 4 ]
%
% % minimal searchlight example
% ds=cosmo_synthetic_dataset('type','fmri');
% % use searchlight with radius 1 voxel (radius=3 is more typical)
% nbrhood=cosmo_spherical_neighborhood(ds,'radius',1,'progress',false);
% % run searchlight
% res=cosmo_searchlight(ds,nbrhood,@cosmo_correlation_measure,...
% 'progress',false);
% cosmo_disp(res.samples)
% %|| [ 1.87 1.25 1.51 1.68 1.71 0.879 ]
% cosmo_disp(res.sa)
% %|| .labels
% %|| { 'corr' }
%
% Notes:
% - by default the post_corr_func is set to @atanh. This is equivalent to
% a Fisher transformation from r (correlation) to z (standard z-score).
% The underlying math is z=atanh(r)=.5*log((1+r)./(1-r)).
% The rationale is to make data more normally distributed under the
% null hypothesis.
% Fisher-transformed correlations can be transformed back to
% their original correlation values using 'tanh', which is the inverse
% of 'atanh'.
% - To disable the Fisher transformation (applied by default), set the
% 'post_corr_func' option to [] (see the sketch after these notes).
% - if multiple samples are present with the same chunk and target, they
% are averaged *prior* to computing the correlations.
% - if multiple partitions are present, then the correlations are
% computed separately for each partition, and then averaged (unless
% the 'output' option is set to a value other than 'mean').
% - When more than two chunks are present in the input, partitions
% consist of all possible half splits for which the number of unique
% chunks in the train and test sets differs by at most 1.
% For illustration, up to 6 chunks, the partitions are:
%
% - 2 chunks -   partition   #1
%   chunks first half:   {    1
%   chunks second half:  {    2
%
% - 3 chunks -   partition   #1  #2  #3
%   chunks first half:   {    3   2   1
%
%                        {    1   1   2
%   chunks second half:  {    2   3   3
%
% - 4 chunks -   partition   #1  #2  #3
%                        {    3   2   2
%   chunks first half:   {    4   4   3
%
%                        {    1   1   1
%   chunks second half:  {    2   3   4
%
% - 5 chunks -   partition   #1  #2  #3  #4  #5  #6  #7  #8  #9 #10
%                        {    4   3   3   2   2   2   1   1   1   1
%   chunks first half:   {    5   5   4   5   4   3   5   4   3   2
%
%                        {    1   1   1   1   1   1   2   2   2   3
%   chunks second half:  {    2   2   2   3   3   4   3   3   4   4
%                        {    3   4   5   4   5   5   4   5   5   5
%
% - 6 chunks -   partition   #1  #2  #3  #4  #5  #6  #7  #8  #9 #10
%                        {    4   3   3   3   2   2   2   2   2   2
%   chunks first half:   {    5   5   4   4   5   4   4   3   3   3
%                        {    6   6   6   5   6   6   5   6   5   4
%
%                        {    1   1   1   1   1   1   1   1   1   1
%   chunks second half:  {    2   2   2   2   3   3   3   4   4   5
%                        {    3   4   5   6   4   5   6   5   6   6
% Thus, with an increasing number of chunks, the number of partitions
% (and thus the time required to run this function) grows rapidly
% (combinatorially in the number of chunks). To use simpler partition
% schemes (e.g. odd-even, as provided by cosmo_oddeven_partitioner),
% specify the 'partitions' argument; see the sketches below.
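%
% As a minimal sketch of the default Fisher transformation and its
% inverse (using only built-in Matlab functions):
%
% r=0.5; % hypothetical correlation value
% z=atanh(r); % Fisher transform: z=.5*log((1+r)/(1-r))
% r_back=tanh(z); % tanh inverts atanh; r_back equals r up to
% % floating-point precision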
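%
% A sketch of supplying a simpler partition scheme through the
% 'partitions' option; this assumes a dataset with an even number of
% chunks, so that cosmo_oddeven_partitioner's 'half' scheme applies:
%
% ds=cosmo_synthetic_dataset('type','fmri','nchunks',4);
% partitions=cosmo_oddeven_partitioner(ds,'half');
% c=cosmo_correlation_measure(ds,'partitions',partitions);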
%
% References
% - Haxby, J. V. et al (2001). Distributed and overlapping
% representations of faces and objects in ventral temporal cortex.
% Science 293, 2425-2430
%
% # For CoSMoMVPA's copyright information and license terms, #
% # see the COPYING file distributed with CoSMoMVPA. #
persistent cached_partitions
persistent cached_chunks
persistent cached_params
persistent cached_varargin
% optimize parameter parsing: if the same arguments were used in
% a previous call, do not use cosmo_structjoin (which is relatively
% expensive)
if ~isempty(cached_params) && isequal(cached_varargin, varargin)
params = cached_params;
else
defaults = struct();
defaults.partitions = [];
defaults.template = [];
defaults.merge_func = [];
defaults.corr_type = 'Pearson';
defaults.post_corr_func = @atanh;
defaults.output = 'mean';
defaults.check_partitions = true;
params = cosmo_structjoin(defaults, varargin);
show_warning_if_no_defaults(defaults, params);
cached_params = params;
cached_varargin = varargin;
end
partitions = params.partitions;
template = params.template;
merge_func = params.merge_func;
post_corr_func = params.post_corr_func;
check_partitions = params.check_partitions;
chunks = ds.sa.chunks;
if isempty(partitions)
if ~isempty(cached_chunks) && isequal(cached_chunks, chunks)
partitions = cached_partitions;
% assume that partitions were already checked
check_partitions = false;
else
partitions = cosmo_nchoosek_partitioner(ds, 'half');
cached_chunks = chunks;
cached_partitions = partitions;
end
end
if check_partitions
cosmo_check_partitions(partitions, ds, params);
end
targets = ds.sa.targets;
nsamples = size(targets, 1);
[classes, unused, class_ids] = fast_unique(targets);
nclasses = numel(classes);
switch nclasses
case 0
error('No classes found - this is not supported');
case 1
if ~isequal(template, 1)
error(['Only one unique value for .sa.targets was found; '...
'this is only allowed when the ''template'' '...
'parameter is explicitly set to 1.\n'...
'Note that this option does not compute the '...
'correlation *difference* between matching and '...
'non-matching values in .sa.targets; instead it '...
'computes the direct correlation '...
'between two halves of the data. A typical use '...
'case is when .samples already contains a '...
'difference score comparing two different '...
'conditions']);
end
otherwise
if isempty(template)
template = (eye(nclasses) - 1 / nclasses) / (nclasses - 1);
else
max_tolerance = 1e-8;
if abs(sum(template(:))) > max_tolerance
error('Template matrix does not have a sum of zero');
end
expected_size = [nclasses, nclasses];
if ~isequal(size(template), expected_size)
error('Template must be of size %dx%d', expected_size);
end
end
end
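% non-finite (e.g. NaN) entries in the template are excluded when
% weighting the correlations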
template_msk = isfinite(template);
npartitions = numel(partitions.train_indices);
% space for output
pdata = cell(npartitions, 1);
% keep track of how often each chunk was used in test_indices,
% and which chunk was used last
test_chunks_last = NaN(nsamples, 1);
test_chunks_count = zeros(nsamples, 1);
for k = 1:npartitions
train_indices = partitions.train_indices{k};
test_indices = partitions.test_indices{k};
% get data in each half
half1 = get_data(ds, train_indices, class_ids, merge_func);
half2 = get_data(ds, test_indices, class_ids, merge_func);
% compute raw correlations
raw_c = cosmo_corr(half1', half2', params.corr_type);
% apply post-processing (usually Fisher-transform, i.e. atanh)
c = apply_post_corr_func(post_corr_func, raw_c);
% aggregate results
pdata{k} = aggregate_correlations(c, template, template_msk, params.output);
test_chunks_last(test_indices) = chunks(test_indices);
test_chunks_count(test_indices) = test_chunks_count(test_indices) + 1;
end
ds_sa = struct();
switch params.output
case 'mean'
ds_sa.samples = mean(cat(2, pdata{:}), 2);
ds_sa.sa.labels = {'corr'};
case {'raw', 'correlation'}
ds_sa.samples = mean(cat(2, pdata{:}), 2);
ds_sa.sa.half1 = reshape(repmat((1:nclasses)', nclasses, 1), [], 1);
ds_sa.sa.half2 = reshape(repmat(1:nclasses, nclasses, 1), [], 1);
ds_sa.a.sdim = struct();
ds_sa.a.sdim.labels = {'half1', 'half2'};
ds_sa.a.sdim.values = {classes, classes};
case 'mean_by_fold'
ds_sa.sa.partition = (1:npartitions)';
ds_sa.samples = [pdata{:}]';
otherwise
assert(false, 'this should be caught by aggregate_correlations');
end
function c = apply_post_corr_func(post_corr_func, c)
if ~isempty(post_corr_func)
c = post_corr_func(c);
end
function agg_c = aggregate_correlations(c, template, template_msk, output)
switch output
case {'mean', 'mean_by_fold'}
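% weighted sum of correlations; with the default template this
% equals the mean of the on-diagonal minus the mean of the
% off-diagonal elements of the correlation matrix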
pcw = c(template_msk) .* template(template_msk);
agg_c = sum(pcw(:));
case {'raw', 'correlation'}
agg_c = c(:);
case 'one_minus_correlation'
error(['the ''output'' option ''one_minus_correlation'' '...
'has been removed. Please contact CoSMoMVPA''s '...
'authors if you really need this option']);
otherwise
error('Unsupported output %s', output);
end
function [unq, pos, ids] = fast_unique(x)
% faster replacement for unique(); assumes a column vector input
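% example: x=[2;1;2] yields unq=[1;2], pos=[2;1], ids=[2;1;2]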
[y, i] = sort(x, 1);
msk = [true; diff(y) > 0];
j = find(msk);
pos = i(j);
unq = y(j);
vs = cumsum(msk);
ids = vs;
ids(i) = vs;
function data = get_data(ds, sample_idxs, class_ids, merge_func)
samples = ds.samples(sample_idxs, :);
target_ids = class_ids(sample_idxs);
nclasses = max(class_ids);
merge_by_averaging = isempty(merge_func);
if isequal(target_ids', 1:nclasses) && merge_by_averaging
% optimize standard case of one sample per class and normal
% averaging over samples
data = samples;
return
end
nfeatures = size(samples, 2);
data = zeros(nclasses, nfeatures);
for k = 1:nclasses
msk = target_ids == k;
n = sum(msk);
if n == 0
% check before merging, so that merge_func is never called
% with an empty input
error('missing data for target class index %d', k);
end
class_samples = samples(msk, :);
if merge_by_averaging
if n == 1
data(k, :) = class_samples;
else
data(k, :) = sum(class_samples, 1) / n;
end
else
data(k, :) = merge_func(class_samples);
end
end
function show_warning_if_no_defaults(defaults, params)
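% warn when 'output' or 'post_corr_func' was set explicitly but
% post_corr_func is (still) @atanh, to remind the user that a Fisher
% transformation is applied after the correlations are computed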
keys = {'output', 'post_corr_func'};
has_defaults = isequal(select_fields(defaults, keys), ...
select_fields(params, keys));
if ~has_defaults && isequal(params.post_corr_func, @atanh)
name = mfilename();
msg = sprintf( ...
['Please note that the ''%s'' function applies '...
'a Fisher transformation after the correlations '...
'have been computed. This was a somewhat unfortunate '...
'implementation decision that will not be changed, '...
'to avoid breaking behaviour with earlier versions.\n'...
'To disable the Fisher transformation, use the '...
'''%s'' function while setting the ''post_corr_func'' '...
'option to the empty array ([])'], name, name);
cosmo_warning(msg);
end
function subset = select_fields(s, keys)
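% return a struct containing only the fields of s named in keys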
subset = struct();
n = numel(keys);
for k = 1:n
key = keys{k};
subset.(key) = s.(key);
end