function is_ok=cosmo_check_dataset(ds, ds_type, error_if_not_ok)
% Check consistency of a dataset.
%
% is_ok=cosmo_check_dataset(ds[, ds_type][, error_if_not_ok])
%
% Inputs:
%   ds                  dataset struct.
%   ds_type             string indicating the specific type of dataset.
%                       Currently supports 'fmri', 'meeg' and 'surface'.
%   error_if_not_ok     if true (the default) or a string, an error is
%                       raised if the dataset is not kosher (see below).
%                       If a string, then it is prefixed (followed by
%                       ': ') in the error message. If false, then no
%                       error is raised.
%
% Returns:
%   is_ok               boolean indicating kosherness of ds.
%                       It is considered ok if:
%                       - it is a struct with no fields other than
%                         .samples, .sa, .fa and .a.
%                       - it has a field .samples with a PxQ array.
%                       - if it has a field .sa [.fa], then
%                         it should be a struct, and each field in it
%                         should have P [Q] elements along the first
%                         [second] dimension or be empty.
%                       - .sa.targets (if present) is a numeric vector
%                         with integer (or NaN) values.
%                       - .sa.chunks (if present) is numeric.
%                       - it has no legacy field .a.dim.
%                       - if .a.sdim [.a.fdim] is present, it has cell
%                         fields .labels and .values with the same number
%                         of elements, and for each label .sa [.fa] has
%                         a field with that name that is either empty or
%                         a vector of integers in the range 1..N, with N
%                         the number of elements in the matching value.
%                       - if ds_type is provided, then some more tests
%                         (depending on ds_type) are performed.
%
% Examples:
%     cosmo_check_dataset([])
%     %|| error('input must be a struct')
%
%     cosmo_check_dataset(struct())
%     %|| error('dataset has no field .samples')
%
%     % this (very minimal) dataset is kosher
%     cosmo_check_dataset(struct('samples',zeros(2)))
%     %|| true
%
%     % error can be silenced
%     cosmo_check_dataset('this is not ok',false)
%     %|| false
%
%     % run some more tests
%     ds=cosmo_synthetic_dataset('type','fmri');
%     cosmo_check_dataset(ds)
%     %|| true
%     ds.sa.chunks=[2;3]; % wrong size
%     cosmo_check_dataset(ds)
%     %|| error('sa.chunks has 2 values in dimension 1, expected 6')
%     ds.sa.chunks={'a','b','c','a','b','c'}';
%     cosmo_check_dataset(ds)
%     %|| error('.sa.chunks must be numeric vector with integers')
%
%     % set illegal dimension values
%     ds=cosmo_synthetic_dataset('type','fmri');
%     ds.a.fdim.values{1}=[1 2];
%     cosmo_check_dataset(ds)
%     %|| error('.fa.i must be vector with integers in range 1..2')
%
%     % check for specific type of dataset
%     ds=cosmo_synthetic_dataset('type','fmri');
%     cosmo_check_dataset(ds,'meeg')
%     %|| error('missing field .a.meeg for meeg-dataset');
%
%     % destroy crucial information in fmri dataset
%     % this error is only caught if explicit checking for fmri dataset is
%     % enabled, because the dataset remains valid when considered as a
%     % non-fmri dataset
%     ds=cosmo_synthetic_dataset('type','fmri');
%     % destroy volume information
%     ds.a=rmfield(ds.a,'vol');
%     cosmo_check_dataset(ds)
%     %|| true % error not caught
%     cosmo_check_dataset(ds,'fmri')
%     %|| error('missing field .a.vol for fmri-dataset')
%
%     % check meeg dataset
%     ds=cosmo_synthetic_dataset('type','meeg');
%     cosmo_check_dataset(ds,'meeg')
%     %|| true
%     ds.fa.chan=ds.fa.chan+6; % outside range
%     cosmo_check_dataset(ds)
%     %|| error('.fa.chan must be vector with integers in range 1..3')
%
% Notes:
%   - if the second argument is a boolean then its value is used for
%     error_if_not_ok, and ds_type is not used (a third argument, if
%     given, is ignored in that case)
%   - this function throws one error at most, even if it is inconsistent
%     for several reasons.
%   - it is good practice to use this function when a new dataset is
%     created to ensure consistency of the data
%
% #   For CoSMoMVPA's copyright information and license terms,   #
% #   see the COPYING file distributed with CoSMoMVPA.           #

    % deal with input arguments
    if nargin<3
        error_if_not_ok=true;
    end

    if nargin<2
        ds_type=[];
    elseif islogical(ds_type)
        % second argument is actually error_if_not_ok
        error_if_not_ok=ds_type;
        ds_type=[];
    end

    % a string for error_if_not_ok means: raise errors, using the string
    % as prefix of the error message
    if ischar(error_if_not_ok)
        error_prefix=error_if_not_ok;
        error_if_not_ok=true;
    else
        error_prefix='';
    end

    % list check functions; they are run in this order and the first
    % one reporting a problem determines the message
    checkers={@check_fields,...
              @check_samples,...
              @check_targets,...
              @check_chunks,...
              @check_attributes,...
              @check_dim_legacy,...
              @check_dim,...
              []}; % space for check_with_type

    if ~isempty(ds_type)
        % add checker for specific type (fmri, meeg, surface)
        checkers{end}=@(x) check_with_type(x,ds_type);
    end

    msg=run_checkers(checkers,ds);
    is_ok=isempty(msg);

    if ~is_ok && error_if_not_ok
        if isempty(error_prefix)
            % avoid a message starting with a stray ': '
            error('%s', msg);
        else
            error('%s: %s', error_prefix, msg);
        end
    end
function msg=run_checkers(checkers,ds)
% Apply each checker in turn to ds and stop at the first complaint.
%
% Inputs:
%   checkers    cell with function handles; empty entries are skipped.
%   ds          value passed as the single argument to each checker.
%
% Returns:
%   msg         output of the first checker that returned a non-empty
%               value; empty if no checker complained.
    msg='';

    for idx=1:numel(checkers)
        check_func=checkers{idx};

        if ~isempty(check_func)
            msg=check_func(ds);

            if ~isempty(msg)
                % first problem found, do not run remaining checkers
                return
            end
        end
    end
function msg=check_with_type(ds, ds_type)
% Additional checks for an fmri, surface or meeg dataset.
%
% Inputs:
%   ds          dataset struct.
%   ds_type     one of 'fmri', 'surface' or 'meeg'; any other value
%               raises an error.
%
% Returns:
%   msg         empty if ds passes the checks for ds_type; otherwise a
%               string describing the first problem found.
%
% Note: this function reads ds.a.fdim.labels directly; in the list of
%       checkers it runs after check_dim, which verifies that .labels
%       is present whenever .a.fdim is present.
    msg='';

    switch ds_type
        case 'fmri'
            required_dim_labels={'i','j','k'};
            a_fields={'vol'};

        case 'surface'
            required_dim_labels={'node_indices'};
            a_fields={};

        case 'meeg'
            required_dim_labels={};
            a_fields={'meeg'};

        otherwise
            error('Unsupported ds_type=%s', ds_type);
    end

    % check presence of .a.fdim field
    if ~cosmo_isfield(ds, 'a.fdim', false)
        msg='missing field .a.fdim';
        return;
    end

    % every required dimension label must occur in .a.fdim.labels
    m=cosmo_match(required_dim_labels,ds.a.fdim.labels);
    if any(~m)
        i=find(~m,1);
        msg=sprintf('missing value %s in .a.fdim.labels for %s-dataset',...
                        required_dim_labels{i}, ds_type);
        return
    end

    % every required dataset attribute must be a field of .a
    a_fns=fieldnames(ds.a);
    m=cosmo_match(a_fields,a_fns);
    if any(~m)
        i=find(~m,1);
        msg=sprintf('missing field .a.%s for %s-dataset',...
                        a_fields{i}, ds_type);
        return
    end
function tf=is_int_vector(x)
% Return true if x is a numeric vector in which every element is either
% integer-valued or NaN; return false otherwise.
    tf=false;

    if ~isnumeric(x) || ~isvector(x)
        return
    end

    is_whole=(x==round(x));
    tf=all(is_whole | isnan(x));
function msg=check_dim_legacy(ds)
% Detect the legacy field .a.dim.
%
% Returns an empty string if ds has no field .a.dim; otherwise a message
% explaining how to move its contents to .a.fdim.
    if ~cosmo_isfield(ds,'a.dim')
        msg='';
        return;
    end

    legacy_fmt=['***CoSMoMVPA legacy***\n'...
                'Feature dimension information is now stored '...
                'in .a.fdim, whereas earlier versions used .a.dim. '...
                'To adapt a existing dataset struct ''ds'', run:\n'...
                ' ds.a.fdim=ds.a.dim;\n'...
                ' ds.a=rmfield(ds.a,''dim'')\n'];

    % sprintf converts the \n escapes to newline characters
    msg=sprintf(legacy_fmt);
function msg=check_fields(ds)
% Check that ds is a struct with only the fields allowed in a dataset.
%
% Returns an empty string if ds is a struct whose fields are all among
% samples, fa, sa and a; otherwise a message describing the problem.
    if ~isstruct(ds)
        msg='input must be a struct';
        return;
    end

    allowed={'samples','fa','sa','a'};
    unknown=setdiff(fieldnames(ds),allowed);

    if isempty(unknown)
        msg='';
    else
        % report only the first offending field
        msg=sprintf('illegal field .%s', unknown{1});
    end
function msg=check_targets(ds)
% Check that .sa.targets, if present, is a vector with integers.
%
% Returns an empty string if ds has no field .sa.targets or if that
% field passes is_int_vector; otherwise a message describing the problem.
    has_targets=cosmo_isfield(ds,'sa.targets');

    if has_targets && ~is_int_vector(ds.sa.targets)
        msg=['.sa.targets must be numeric vector with integers '...
             '(.sa.labels can be used to store string labels)'];
    else
        msg='';
    end
function msg=check_chunks(ds)
% Check that .sa.chunks, if present, is numeric.
%
% Returns an empty string if ds has no field .sa.chunks or if that field
% is numeric; otherwise a message describing the problem.
%
% NOTE(review): the message mentions a "vector with integers", but only
% isnumeric is tested here (unlike check_targets, which uses
% is_int_vector). Confirm whether non-integer chunks are allowed on
% purpose before tightening this check.
    has_chunks=cosmo_isfield(ds,'sa.chunks');

    if has_chunks && ~isnumeric(ds.sa.chunks)
        msg='.sa.chunks must be numeric vector with integers';
    else
        msg='';
    end
function msg=check_samples(ds)
% Check that ds has a two-dimensional field .samples.
%
% Returns an empty string if .samples is present and 2D; otherwise a
% message describing the problem.
    if ~isfield(ds,'samples')
        msg='dataset has no field .samples';
        return
    end

    % .samples is present, so verify its dimensionality
    n_dims=numel(size(ds.samples));

    if n_dims==2
        msg='';
    else
        msg=sprintf('.samples should be 2D, found %dD', n_dims);
    end
function msg=check_attributes(ds)
% Check that sample and feature attributes agree in size with .samples.
%
% Inputs:
%   ds      struct with a field .samples (this function reads
%           size(ds.samples) without checking that the field exists).
%
% Returns:
%   msg     empty if all is fine; otherwise a string describing the
%           first problem found. A problem is reported when .sa [.fa]
%           is present but not a struct, or when a non-empty field in
%           it is not 2D or does not have as many elements along the
%           first [second] dimension as .samples has.
    msg='';

    attrs_fns={'sa','fa'};
    ds_size=size(ds.samples);

    % check sample (dim=1) and feature (dim=2) attributes
    for dim=1:2
        attrs_fn=attrs_fns{dim};
        if ~isfield(ds, attrs_fn)
            continue;
        end

        % get feature/sample attributes
        attrs=ds.(attrs_fn);

        % fieldnames() raises its own error for non-struct input, which
        % would bypass the msg mechanism; report it through msg instead
        if ~isstruct(attrs)
            msg=sprintf('%s must be a struct', attrs_fn);
            return
        end

        fns=fieldnames(attrs);
        n=numel(fns);

        % check each one
        for j=1:n
            fn=fns{j};
            attr=attrs.(fn);

            % empty attributes are always allowed
            if isempty(attr)
                continue;
            end

            attr_size=size(attr);
            if numel(attr_size) ~= 2
                msg=sprintf('%s.%s should be 2D', attrs_fn, fn);
                return
            end

            if attr_size(dim) ~= ds_size(dim)
                msg=sprintf(['%s.%s has %d values in dimension '...
                             '%d, expected %d'], attrs_fn, fn,...
                             attr_size(dim), dim, ds_size(dim));

                % the size along the other dimension matches, which
                % suggests the attribute was stored transposed
                if attr_size(3-dim) == ds_size(dim)
                    msg=[msg ' (maybe the data was intended '...
                             'to be transposed?)'];
                end
                return
            end
        end
    end
function msg=check_dim(ds)
% Check dimension information of a dataset, i.e. .a.sdim and .a.fdim
% (each is only checked if present).
%
% For .a.sdim [.a.fdim] the matching attributes .sa [.fa] must exist;
% the detailed comparison is delegated to check_dim_helper.
%
% Returns an empty string if no problem was found; otherwise a message
% describing the first problem.
    msg='';

    % 's' refers to samples (.sa, .a.sdim), 'f' to features (.fa, .a.fdim)
    for prefix='sf'
        dim_key=[prefix 'dim'];
        dim_path=['a.' dim_key];

        if cosmo_isfield(ds,dim_path)
            attr_key=[prefix 'a'];

            if ~isfield(ds, attr_key)
                msg=sprintf('Missing field .%s',attr_key);
                return
            end

            msg=check_dim_helper(ds.(attr_key), ds.a.(dim_key), ...
                                        attr_key, dim_path);
            if ~isempty(msg)
                return
            end
        end
    end
function msg=check_dim_helper(attrs, dim_attrs, attrs_str, dim_attrs_str)
% Compare attributes with their dimension description.
%
% Inputs:
%   attrs           contents of .sa or .fa
%   dim_attrs       contents of .a.sdim or .a.fdim
%   attrs_str       string representation of attrs, used in messages
%   dim_attrs_str   string representation of dim_attrs, used in messages
%
% Returns:
%   msg             empty if no problem was found; otherwise a string
%                   describing the first problem.
    msg='';

    has_required=cosmo_isfield(dim_attrs,{'labels','values'});
    if ~all(has_required)
        msg=sprintf('Missing field .%s.{labels,values}',dim_attrs_str);
        return;
    end

    dim_labels=dim_attrs.labels;
    dim_values=dim_attrs.values;

    if ~iscellstr(dim_labels)
        msg=sprintf('.%s.labels must be a cell', dim_attrs_str);
        return
    end

    if ~iscell(dim_values)
        msg=sprintf('.%s.values must be a cell', dim_attrs_str);
        return
    end

    n_labels=numel(dim_labels);
    if numel(dim_values)~=n_labels
        msg=sprintf('size mismatch between .%s.labels and .%s.values',...
                        dim_attrs_str,dim_attrs_str);
        return
    end

    for k=1:n_labels
        label=dim_labels{k};

        if ~isfield(attrs, label)
            msg=sprintf('Missing field .%s.%s', attrs_str, label);
            return
        end

        indices=attrs.(label);

        % empty vectors are allowed (in empty datasets)
        if isempty(indices)
            continue
        end

        upper=numel(dim_values{k});
        is_int=is_int_vector(indices);

        % short-circuit evaluation ensures that min and max are only
        % computed when indices is a numeric vector; the comparisons are
        % deliberately written with < and > so that NaN is not flagged
        is_bad=~is_int || min(indices)<1 || max(indices)>upper;
        if ~is_bad
            continue
        end

        msg=sprintf(['.%s.%s must be vector with integers in '...
                     'range 1..%d'],attrs_str,label,upper);

        if is_int && min(indices)==0
            % could be mistaken base-0 indexing
            hint_fmt=['%s\nThe lowest index is 0, which may '...
                      'indicate base-0 indexing (the first '...
                      'element is indexed by 0). Note that '...
                      'Matlab (and CoSMoMVPA) use base-1 '...
                      'indexing.\n'...
                      '- Manual conversion from base-0 to '...
                      'base-1 can be achieved by increasing '...
                      'the values in .%s.%s by 1.\n'...
                      '- If this dataset was exported from '...
                      'PyMVPA and contains fMRI volumetric '...
                      'or surface-based data, consider using '...
                      'cosmo_fmri_dataset or '...
                      'cosmo_surface_dataset (respectively) '...
                      'to convert PyMVPA''s base-0 indexing to '...
                      'CoSMoMVPA''s base-1 indexing'];
            msg=sprintf(hint_fmt,msg,attrs_str,label);
        end

        return
    end