function res=cosmo_distatis(ds, varargin)
% apply DISTATIS measure to each feature
%
% res=cosmo_statis_measure(ds, opt)
%
% Inputs:
% ds dataset struct with dissimilarity values; usually
% the output from @cosmo_dissimilarity_matrix_measure
% applied to each subject followed by cosmo_stack. It
% can also be a cell with datasets (one per subject).
% 'return', d d can be 'distance' (default) or 'crossproduct'.
% 'distance' returns a distance matrix, whereas
% 'crossproduct' returns a crossproduct matrix
% 'split_by', s sample attribute that discriminates chunks
% (participants) (default: 'chunks')
% 'shape', sh shape of output if it were unflattened using
% cosmo_unflatten, either 'square' (default) or
% 'triangle' (which gives the lower diagonal of the
% distance matrix)
%
% Returns:
% res result dataset struct with feature-wise optimal
% compromise distance matrix across subjects
% .samples
%
%
% Example:
% % (This example cannot be documentation tested using Octave,
% % since Octave does not allow for-loops with evalc)
% cosmo_skip_test_if_no_external('matlab');
% %
% ds=cosmo_synthetic_dataset('nsubjects',5,'nchunks',1,'ntargets',4);
% %
% % define neighborhood (here a searchlight with radius of 1 voxel)
% nbrhood=cosmo_spherical_neighborhood(ds,'radius',1,'progress',false);
% %
% % define measure
% measure=@cosmo_dissimilarity_matrix_measure;
% % each subject is a chunk
% ds.sa.chunks=ds.sa.subject;
% % compute DSM for each subject
% sp=cosmo_split(ds,'chunks');
% for k=1:numel(sp)
% sp{k}=cosmo_searchlight(sp{k},nbrhood,measure,'progress',false);
% sp{k}.sa.chunks=ones(6,1)*k;
% end
% % merge results
% dsms=cosmo_stack(sp);
% %
% r=cosmo_distatis(dsms,'return','distance','progress',false);
% cosmo_disp(r);
% %|| .samples
% %|| [ 0 0 0 0 0 0
% %|| 0.818 1.09 0.77 0.653 1.03 0.421
% %|| 0.869 1.3 1.06 1.04 0.932 1.07
% %|| : : : : : :
% %|| 1.16 0.889 0.99 0.631 1.48 0.621
% %|| 0.268 0.952 0.965 0.462 0.943 1.04
% %|| 0 0 0 0 0 0 ]@16x6
% %|| .fa
% %|| .center_ids
% %|| [ 1 2 3 4 5 6 ]
% %|| .i
% %|| [ 1 2 3 1 2 3 ]
% %|| .j
% %|| [ 1 1 1 2 2 2 ]
% %|| .k
% %|| [ 1 1 1 1 1 1 ]
% %|| .nvoxels
% %|| [ 3 4 3 3 4 3 ]
% %|| .radius
% %|| [ 1 1 1 1 1 1 ]
% %|| .quality
% %|| [ 0.685 0.742 0.617 0.648 0.757 0.591 ]
% %|| .nchunks
% %|| [ 5 5 5 5 5 5 ]
% %|| .a
% %|| .fdim
% %|| .labels
% %|| { 'i' 'j' 'k' }
% %|| .values
% %|| { [ 1 2 3 ] [ 1 2 ] [ 1 ] }
% %|| .sdim
% %|| .labels
% %|| { 'targets1' 'targets2' }
% %|| .values
% %|| { [ 1 [ 1
% %|| 2 2
% %|| 3 3
% %|| 4 ] 4 ] }
% %|| .vol
% %|| .mat
% %|| [ 2 0 0 -3
% %|| 0 2 0 -3
% %|| 0 0 2 -3
% %|| 0 0 0 1 ]
% %|| .dim
% %|| [ 3 2 1 ]
% %|| .xform
% %|| 'scanner_anat'
% %|| .sa
% %|| .targets1
% %|| [ 1
% %|| 2
% %|| 3
% %|| :
% %|| 2
% %|| 3
% %|| 4 ]@16x1
% %|| .targets2
% %|| [ 1
% %|| 1
% %|| 1
% %|| :
% %|| 4
% %|| 4
% %|| 4 ]@16x1
%
% Reference:
% - Abdi, H., Valentin, D., O?Toole, A. J., & Edelman, B. (2005).
% DISTATIS: The analysis of multiple distance matrices. In
% Proceedings of the IEEE Computer Society: International conference
% on computer vision and pattern recognition, San Diego, CA, USA
% (pp. 42?47).
%
% Notes:
% - DISTATIS tries to find an optimal compromise distance matrix across
% the different samples (participants)
% - Output can be reshape to matrix or array form using
% cosmo_unflatten(res,1)
%
% # For CoSMoMVPA's copyright information and license terms, #
% # see the COPYING file distributed with CoSMoMVPA. #
cosmo_check_external('distatis');
defaults.return='distance';
defaults.split_by='chunks';
defaults.shape='square';
defaults.mask_output=[];
defaults.progress=100;
defaults.feature_ids=[];
defaults.autoscale=true;
defaults.abs_correlation=false;
defaults.weights='eig';
opt=cosmo_structjoin(defaults,varargin);
subject_cell=get_subject_data(ds,opt);
nsubj=numel(subject_cell);
[dsms,nclasses,dim_labels,dim_values]=get_dsms(subject_cell);
feature_ids=get_feature_ids(size(dsms{1},3),opt);
nfeatures=numel(feature_ids);
quality=zeros(1,nfeatures);
nobservations=zeros(1,nfeatures);
prev_msg='';
clock_start=clock();
show_progress=nfeatures>1 && opt.progress;
for k=1:nfeatures
feature_id=feature_ids(k);
x=zeros(nclasses*nclasses,nsubj);
for j=1:nsubj
dsm=dsms{j}(:,:,feature_id);
x(:,j)=distance2crossproduct(dsm, opt.autoscale);
end
[x,subj_msk]=cosmo_remove_useless_data(x);
nkeep=sum(subj_msk);
% equivalent, but slower:
% [e,v]=eigs(c,1);
[ew,v]=get_weights(x, feature_id, nkeep, opt);
% compute compromise
compromise=x*ew;
result=convert_compromise(compromise, opt);
if feature_id==1
% allocate space
samples=zeros(numel(result),nfeatures);
end
samples(:,k)=result;
quality(:,k)=v/nkeep;
nobservations(:,k)=nkeep;
if show_progress && (k<10 || ...
mod(k, opt.progress)==0 || ...
k==nfeatures)
status=sprintf('quality=%.3f%% (avg)',mean(quality(1:k)));
prev_msg=cosmo_show_progress(clock_start,k/nfeatures,...
status,prev_msg);
end
end
% set output in either triangular or square shape
[res,i,j]=get_samples_in_shape(samples,nclasses,opt.shape);
res=copy_fields(ds,res,{'fa','a'});
% add attributes
res.fa.quality=quality;
res.fa.nchunks=nobservations;
res.a.sdim=struct();
res.a.sdim.labels=dim_labels;
res.a.sdim.values=dim_values;
res.sa.(dim_labels{1})=i(:);
res.sa.(dim_labels{2})=j(:);
cosmo_check_dataset(res);
function [res,i,j]=get_samples_in_shape(samples,nclasses,shape)
res=struct();
switch shape
case 'triangle'
[msk,i,j]=distance_matrix_mask(nclasses);
res.samples=samples(msk(:),:);
case 'square'
res.samples=samples;
[i,j]=find(ones(nclasses));
otherwise
error('unsupported direction %s', shape);
end
function dst=copy_fields(src,dst,keys)
for k=1:numel(keys)
key=keys{k};
if isfield(src,key)
dst.(key)=src.(key);
end
end
function feature_ids=get_feature_ids(nfeatures, opt)
feature_ids=opt.feature_ids;
if isempty(feature_ids);
feature_ids=1:nfeatures;
end
function [ew,v]=get_weights(x, feature_id, nkeep, opt)
switch opt.weights
case 'eig'
[ew,v]=eigen_weights(x, feature_id);
case 'uniform'
% all the same (allowing for comparison with 'eig')
ew=ones(nkeep,1)/nkeep;
v=0;
otherwise
error('illegal weight %s', opt.weights);
end
function subject_cell=get_subject_data(ds,opt)
if isstruct(ds)
subject_cell=cosmo_split(ds,opt.split_by);
else
subject_cell=ds;
end
if numel(subject_cell)==0
error('empty input');
end
function [ew,v]=eigen_weights(x, feature_id)
c=cosmo_corr(x);
negative_c=c<0;
if any(negative_c(:))
[i,j]=find(negative_c);
error(['feature %d has negative correlation between '...
'sample %d and %d, which is not supported by '...
'distatis. DISTATIS assumes that the similarity '...
'data from all samples (typically: participants) '...
'correlate positively. Because that is not the '...
'case, you cannot use DISTATIS analysis on this '...
'data. '],...
feature_id,i(1),j(1));
end
[v,e]=fast_eig1(c);
if all(e<0)
e=-e;
end
assert(all(e>0));
assert(v>0);
% normalize first eigenvector
ew=e/sum(e);
function result=convert_compromise(compromise, opt)
switch opt.return
case 'crossproduct'
result=compromise;
case 'distance'
result=crossproduct2distance(compromise);
otherwise
error('illegal opt.return');
end
function z=crossproduct2distance(x)
n=sqrt(numel(x));
e=ones(n,1);
d=x(1:(n+1):end);
dd=d*e';
ddt=dd';
y=dd(:)+ddt(:)-2*x;
z=ensure_distance_vector(y);
function assert_symmetric(x, tolerance)
if nargin<2, tolerance=1e-8; end
% assert x is a square matrix
sz=size(x);
assert(isequal(sz,sz([2 1])));
xx=x'-x;
msk=xx>tolerance;
if any(msk)
[i,j]=find(msk,1);
error('not symmetric: x(%d,%d)=%d ~= %d=x(%d,%d)',...
i,j,x(i,j),x(j,i),j,i);
end
function z_vec=distance2crossproduct(x, autoscale)
n=size(x,1);
e=ones(n,1);
m=e*(1/n);
ee=eye(n)-e*m';
y=-.5*ee*(x+x')*ee';
if autoscale
z=(1/fast_eig1(y))*y;
else
z=y;
end
assert_symmetric(z);
% equivalent, but slower:
% z=(1/eigs(y,1))*y(:);
z_vec=z(:);
function [lambda,pivot]=fast_eig1(x)
% returns the first eigenvalue in lambda, and the corresponding
% eigenvector in pivot
if cosmo_wtf('is_matlab')
[pivot,lambda]=eigs(x,1);
else
% There seems a bug in Octave for 'eigs',
% so use 'eig' instead.
% http://savannah.gnu.org/bugs/?44004
[e,v]=eig(x);
diag_v=diag(v);
% find largest eigenvalue and eigenvector
[lambda,i]=max(diag_v);
pivot=e(:,i);
end
% The code below is disabled because under certain circumstances
% it would return a near-zero eigenvalue if indeed one eigenvalue (but
% not the largest one) is zero.
% % compute first (largest) eigenvalue and corresponding eigenvector
% % using power iteration method; benchmarking suggests this can be up to
% % five times as fast as using eigs(x,1)
% n=size(x,1);
% pivot=ones(n,1);
% tolerance=1e-8;
% max_iter=1000;
%
% old_lambda=NaN;
% for k=1:max_iter
% z=x*pivot;
% pivot=z / norm(z);
%
% lambda=pivot'*z;
% if abs(lambda-old_lambda)/lambda<tolerance
% z=x*pivot;
% pivot=z / sqrt(sum(z.^2));
%
% lambda=pivot'*z;
% return
% end
% old_lambda=lambda;
% end
%
% % matlab fallback
% [pivot,lambda]=eigs(x,1);
function y=ensure_distance_vector(x)
tolerance=1e-8;
n=sqrt(numel(x));
xsq=reshape(x,n,n);
dx=diag(xsq);
assert(all(dx<tolerance));
xsq=xsq-diag(dx);
delta=xsq-xsq';
assert(all(delta(:)<tolerance));
xsq=.5*(xsq+xsq');
y=xsq(:);
function [dsms,nclasses,dim_labels,dim_values]=get_dsms(data_cell)
nsubj=numel(data_cell);
% allocate
dsms=cell(nsubj,1);
for k=1:numel(data_cell)
data=data_cell{k};
% get data
[dsm,dim_labels,dim_values,is_ds]=get_dsm(data);
% store data
dsms{k}=dsm;
if k==1
nclasses=size(dsm,1);
first_dim_labels=dim_labels;
first_dim_values=dim_values;
data_first=data;
else
if ~isequal(first_dim_labels,dim_labels)
error('dim label mismatch between subject 1 and %d',k);
end
if ~isequal(first_dim_values,dim_values)
error('dim label mismatch between subject 1 and %d',k);
end
% check for compatibility over subjects, raises an error if not
% kosher
if is_ds
cosmo_stack({cosmo_slice(data,1),...
cosmo_slice(data_first,1)},1,'unique');
end
end
end
function [msk,i,j]=distance_matrix_mask(nclasses)
msk=triu(repmat(1:nclasses,nclasses,1),1)'>0;
[i,j]=find(msk);
function [dsm, dim_labels, dim_values, is_ds]=get_dsm(data)
is_ds=isstruct(data);
if is_ds
[dsm,dim_labels,dim_values]=cosmo_unflatten(data,1);
elseif isnumeric(data)
sz=size(data);
if numel(sz)~=2
error('only vectorized distance matrices are supported');
end
[n,nfeatures]=size(data);
side=(1+sqrt(1+8*n))/2; % so that side*(side-1)/2==n
if ~isequal(side, round(side))
error(['size %d of input vector is not correct for '...
'the number of elements below the diagonal of a '...
'square (distance) matrix'], n);
end
[msk,i,j]=distance_matrix_mask(side);
dsm=zeros([side,side,nfeatures]);
assert(numel(i)==n);
for pos=1:n
dsm(i(pos),j(pos),:)=data(pos,:);
end
sq1=cosmo_squareform(data(:,1));
dsm1=dsm(:,:,1);
assert(isequal(sq1,dsm1+dsm1'));
dim_labels={'targets1','targets2'};
dim_values={(1:side)',(1:side)'};
else
error('illegal input: expect dataset struct, or cell with arrays');
end