function ds = cosmo_synthetic_dataset(varargin)
% generate synthetic dataset
%
% ds=cosmo_synthetic_dataset(varargin)
%
% Inputs:
% 'size', s determines the number of features. One of
% 'tiny', 'small', 'normal', 'big', or 'huge'
% 'type', t type of dataset. One of 'fmri', 'meeg',
% 'timelock', 'timefreq', 'surface', or 'source'.
% 'meeg' is equivalent to 'timelock'
% 'sens', c for meeg datasets ('time{lock}, 'meeg'), this
% sets which sensor type is used. Supported are
% - neuromag306{all,planar,combined,mag}
% - ctf151
% - 4d{1,2}48[planar]
% - yokogawa{64,160}[planar]
% - eeg10{05,10,20}
% 'data_field', f If the type t is 'source', this can be 'pow' or
% 'mom'.
% 'sigma', sigma parameter to influence class distance. The
% larger this value, the more discrimable the
% classes are.
%
% 'ntargets', nt number of unique targets (default: 2)
% 'nchunks', nc number of unique chunks (default: 3)
%
% 'nmodalities', nm number of unique modalities (default: [])
% 'nsubjects', ns number of unique subjects (default: [])
% 'nreps', nr number of sample repeats (default: [])
% 'seed', seed seed value for pseudo-random number generator
% (default: 1). Different seed values give
% different random numbers.
%
% Output:
% ds dataset struct according to parameters.
% The output has (nt*nc*nm*ns*nr) samples, and
% the number of features is determined by s.
% If any of the nt, nc, nm, ns, nr values are not
% empty, there is a corresponding sample
% attribute.
%
% Examples:
% ds=cosmo_synthetic_dataset();
% cosmo_disp(ds);
% %|| .samples
% %|| [ 2.03 -0.892 -0.826 1.16 1.16 -1.29
% %|| 0.584 1.84 1.17 -0.848 3.49 -0.199
% %|| -1.44 -0.262 -1.92 3.09 -1.37 1.73
% %|| -0.518 2.34 0.441 1.86 0.479 0.0832
% %|| 1.19 -0.204 -0.209 1.76 -0.955 0.501
% %|| -1.33 2.72 0.148 0.502 3.41 -0.48 ]
% %|| .fa
% %|| .i
% %|| [ 1 2 3 1 2 3 ]
% %|| .j
% %|| [ 1 1 1 2 2 2 ]
% %|| .k
% %|| [ 1 1 1 1 1 1 ]
% %|| .a
% %|| .fdim
% %|| .labels
% %|| { 'i' 'j' 'k' }
% %|| .values
% %|| { [ 1 2 3 ] [ 1 2 ] [ 1 ] }
% %|| .vol
% %|| .mat
% %|| [ 2 0 0 -3
% %|| 0 2 0 -3
% %|| 0 0 2 -3
% %|| 0 0 0 1 ]
% %|| .dim
% %|| [ 3 2 1 ]
% %|| .xform
% %|| 'scanner_anat'
% %|| .sa
% %|| .targets
% %|| [ 1
% %|| 2
% %|| 1
% %|| 2
% %|| 1
% %|| 2 ]
% %|| .chunks
% %|| [ 1
% %|| 1
% %|| 2
% %|| 2
% %|| 3
% %|| 3 ]
%
% ds=cosmo_synthetic_dataset('sigma',5,...
% 'nchunks',5,'ntargets',4,...
% 'nsubjects',10);
% cosmo_disp(ds)
% %|| .samples
% %|| [ 3.53 1.65 1.74 0.453 1.22 3.28
% %|| 0.584 3.88 0.856 0.841 -0.899 1.23
% %|| -3.68 1.38 1.89 -1.4 0.216 0.68
% %|| : : : : : :
% %|| -0.725 4.41 -0.313 0.111 0.437 -1.03
% %|| 0.297 0.0279 1.8 -0.0303 -0.451 0.346
% %|| 1.64 0.103 -1.22 4.8 0.754 1.03 ]@200x6
% %|| .fa
% %|| .i
% %|| [ 1 2 3 1 2 3 ]
% %|| .j
% %|| [ 1 1 1 2 2 2 ]
% %|| .k
% %|| [ 1 1 1 1 1 1 ]
% %|| .a
% %|| .fdim
% %|| .labels
% %|| { 'i' 'j' 'k' }
% %|| .values
% %|| { [ 1 2 3 ] [ 1 2 ] [ 1 ] }
% %|| .vol
% %|| .mat
% %|| [ 2 0 0 -3
% %|| 0 2 0 -3
% %|| 0 0 2 -3
% %|| 0 0 0 1 ]
% %|| .dim
% %|| [ 3 2 1 ]
% %|| .xform
% %|| 'scanner_anat'
% %|| .sa
% %|| .targets
% %|| [ 1
% %|| 2
% %|| 3
% %|| :
% %|| 2
% %|| 3
% %|| 4 ]@200x1
% %|| .chunks
% %|| [ 1
% %|| 1
% %|| 1
% %|| :
% %|| 5
% %|| 5
% %|| 5 ]@200x1
% %|| .subject
% %|| [ 1
% %|| 1
% %|| 1
% %|| :
% %|| 10
% %|| 10
% %|| 10 ]@200x1
%
% % example of MEEG dataset with ctf151 layout
% ds=cosmo_synthetic_dataset('type','meeg',...
% 'sens','ctf151','size','huge');
% % show labels and values for feature dimensions
% cosmo_disp(ds.a.fdim,'edgeitems',2)
% %|| .labels
% %|| { 'chan'
% %|| 'time' }
% %|| .values
% %|| { { 'MLC11' 'MLC12' ... 'MZP01' 'MZP02' }@1x151
% %|| [ -0.2 -0.15 ... 0.55 0.6 ]@1x17 }
%
% % example of MEEG dataset with 4d148 layout (3 channels only)
% ds=cosmo_synthetic_dataset('type','meeg','sens','4d148_planar');
% cosmo_disp(ds.a.fdim)
% %|| .labels
% %|| { 'chan'
% %|| 'time' }
% %|| .values
% %|| { { 'A148_dH' 'A147_dH' 'A146_dH' }
% %|| [ -0.2 -0.15 ] }
%
% # For CoSMoMVPA's copyright information and license terms, #
% # see the COPYING file distributed with CoSMoMVPA. #
default = struct();
default.ntargets = 2;
default.nchunks = 3;
default.sigma = 3;
default.size = 'small';
default.type = 'fmri';
default.nreps = [];
default.nmodalities = [];
default.nsubjects = [];
default.chunks = [];
default.targets = [];
default.target1 = 1;
% for PRNG
default.seed = 1;
% for MEEG
default.sens = 'neuromag306_all';
% for MEEG source
default.data_field = 'pow'; % 'pow' or 'mom'
opt = cosmo_structjoin(default, varargin);
[ds, nfeatures] = get_fdim_fa(opt);
sa_labels_pl = {'ntargets', 'nchunks', 'nmodalities', 'nsubjects', 'nreps'};
cp = cosmo_cartprod(cellfun(@(x)1:max([opt.(x) 1]), sa_labels_pl, ...
'UniformOutput', false));
sa_labels_si = {'targets', 'chunks', 'modality', 'subject', 'rep'};
for k = 1:numel(sa_labels_si)
if ~isempty(opt.(sa_labels_pl{k}))
ds.sa.(sa_labels_si{k}) = cp(:, k);
end
end
nelem = log(nfeatures * max([opt.nreps 1]));
class_distance = opt.sigma / sqrt(nelem);
ds.samples = generate_samples(ds, class_distance, opt.seed);
ds = assign_sa(ds, opt, {'chunks', 'targets'});
ds.sa.targets = ds.sa.targets + opt.target1 - 1;
cosmo_check_dataset(ds);
function ds = assign_sa(ds, opt, names)
nsamples = size(ds.samples, 1);
for k = 1:numel(names)
name = names{k};
v = opt.(name);
if ~isempty(v)
if ~isscalar(v)
error('Value for ''%s'' must be a scalar', name);
end
ds.sa.(name) = ones(nsamples, 1) * v;
end
end
function samples = generate_samples(ds, class_distance, seed)
targets = ds.sa.targets;
nsamples = numel(targets);
nclasses = numel(unique(targets));
fa_names = fieldnames(ds.fa);
nfeatures = size(ds.fa.(fa_names{1}), 2);
r = cosmo_rand(nsamples, nfeatures, 'seed', seed);
samples = cosmo_norminv(r);
add_msk = mod(bsxfun(@minus, 1:nclasses + 1, targets), nclasses + 1) == 0;
add_msk_full = repmat(add_msk, 1, ceil(nfeatures / (nclasses + 1)));
add_msk_full = add_msk_full(:, 1:nfeatures);
samples(add_msk_full) = samples(add_msk_full) + class_distance;
function a = dim_labels_values(opt)
data_type = opt.type;
sens_type = opt.sens;
a = struct();
switch data_type
case 'fmri'
a.vol.mat = eye(4);
a.vol.mat(1:3, 1:3) = a.vol.mat(1:3, 1:3) * 2;
% ensure negative origin, so that MRIcron displays
% the matrix correctly
a.vol.mat(1:3, 4) = -3;
labels = {'i'; 'j'; 'k'};
values = {1:20; 1:20; 1:20};
case {'meeg', 'timelock', 'timefreq'}
% simulate full neuromag 306 system
chan = get_meeg_channels(sens_type);
time = (-.2:.05:1.3);
switch data_type
case 'timefreq'
freq = (2:2:40);
values = {chan; freq; time};
labels = {'chan'; 'freq'; 'time'};
a.meeg.samples_type = 'freq';
a.meeg.samples_field = 'powspctrm';
otherwise % meeg and timelock
values = {chan; time};
labels = {'chan'; 'time'};
a.meeg.samples_type = 'timelock';
a.meeg.samples_field = 'trial';
end
a.meeg.samples_label = 'rpt';
case 'surface'
labels = {'node_indices'};
values = {1:4000};
case 'source'
xyz = cosmo_cartprod({-70:10:70, -60:10:60, -110:10:110})';
[unused, i] = sort(sum(abs(xyz), 1));
switch opt.data_field
case 'pow'
labels = {'pos', 'time'};
values = {xyz(:, i), -1:.5:14};
a.meeg.samples_field = 'trial.pow';
case 'mom'
labels = {'pos', 'mom', 'time'};
values = {xyz(:, i), {'x', 'y', 'z'}, -1:.5:14};
a.meeg.samples_field = 'trial.mom';
otherwise
error('illegal data_field ''%s''', opt.data_field);
end
otherwise
error('Unsupported type ''%s''', data_type);
end
a.fdim.values = values;
a.fdim.labels = labels;
function labels = get_meeg_channels(sens_type)
% mapping from name to helper function
% each name is prefixed with 'T' to allow names that start with a digit
sens2func = struct();
sens2func.Tneuromag306 = @get_neuromag_chan;
sens2func.Tctf151 = @get_CTF151_chan;
sens2func.T4d148 = @get_4d148_chan;
sens2func.T4d248 = @get_4d248_chan;
sens2func.Teeg1020 = @get_eeg1020_chan;
sens2func.Teeg1010 = @get_eeg1010_chan;
sens2func.Teeg1005 = @get_eeg1005_chan;
sens2func.Tyokogawa64 = @get_yokogawa64_chan;
sens2func.Tyokogawa160 = @get_yokogawa160_chan;
sp = cosmo_strsplit(['T' sens_type], '_');
key = sp{1};
if ~isfield(sens2func, key)
supported_sens = fieldnames(sens2func);
supported_labels = cellfun(@(x)[x(2:end) '*'], supported_sens, ...
'UniformOutput', false);
error('Unsupported sens_type %s, supported are: %s ', ...
key(2:end), cosmo_strjoin(supported_labels, ', '));
end
func = sens2func.(key);
chan_type = [cosmo_strjoin(sp(2:end), '_') '']; % ensure string
labels = func(chan_type);
labels = labels(:)';
function labels = get_eeg1020_chan(chan_type)
labels = get_eeg10XX_chan_helper(chan_type, '1020');
function labels = get_eeg1010_chan(chan_type)
labels = get_eeg10XX_chan_helper(chan_type, '1010');
function labels = get_eeg1005_chan(chan_type)
labels = get_eeg10XX_chan_helper(chan_type, '1005');
function labels = get_eeg10XX_chan_helper(chan_type, sens_type)
assert(isempty(chan_type));
% build mapping from prefix to postfixes
ch = struct();
switch sens_type
case '1020'
ch.Fp = { '1' '2' 'z' };
ch.F = { '3' '4' '7' '8' 'z' };
ch.T = { '3' '4' '5' '6' '7' '8' };
ch.C = { '3' '4' 'z' };
ch.P = { '3' '4' '7' '8' 'z' };
ch.O = { '1' '2' 'z' };
ch.A = { '1' '2' };
ch.M = { '1' '2' };
case '1010'
ch.TP = { '10' '7' '8' '9' };
ch.CP = { '1' '2' '3' '4' '5' '6' 'z' };
ch.Fp = { '1' '2' 'z' };
ch.AF = { '1' '10' '2' '3' '4' '5' '6' '7' '8' '9' 'z' };
ch.F = { '1' '10' '2' '3' '4' '5' '6' '7' '8' '9' 'z' };
ch.FT = { '10' '7' '8' '9' };
ch.FC = { '1' '2' '3' '4' '5' '6' 'z' };
ch.T = { '10' '3' '4' '5' '6' '7' '8' '9' };
ch.C = { '1' '2' '3' '4' '5' '6' 'z' };
ch.P = { '1' '10' '2' '3' '4' '5' '6' '7' '8' '9' 'z' };
ch.PO = { '1' '10' '2' '3' '4' '5' '6' '7' '8' '9' 'z' };
ch.O = { '1' '2' 'z' };
ch.I = { '1' '2' 'z' };
ch.A = { '1' '2' };
ch.M = { '1' '2' };
case '1005'
ch.FCC = { '1' '1h' '2' '2h' '3' '3h' '4' '4h' '5' '5h' '6'...
'6h' 'z' };
ch.F = { '1' '10' '10h' '1h' '2' '2h' '3' '3h' '4' '4h'...
'5' '5h' '6' '6h' '7' '7h' '8' '8h' '9' '9h' ...
'p1' 'p1h' 'p2' 'p2h' 'pz' 'z' };
ch.AF = { '1' '10' '10h' '1h' '2' '2h' '3' '3h' '4' '4h'...
'5' '5h' '6' '6h' '7' '7h' '8' '8h' '9' '9h' ...
'p1' 'p10' 'p10h' 'p1h' 'p2' 'p2h' 'p3' 'p3h' ...
'p4' 'p4h' 'p5' 'p5h' 'p6' 'p6h' 'p7' 'p7h' ...
'p8' 'p8h' 'p9' 'p9h' 'pz' 'z' };
ch.FT = { '10' '10h' '7' '7h' '8' '8h' '9' '9h' };
ch.FC = { '1' '1h' '2' '2h' '3' '3h' '4' '4h' '5' '5h'...
'6' '6h' 'z' };
ch.T = { '10' '10h' '3' '4' '5' '6' '7' '7h' '8' '8h'...
'9' '9h' };
ch.C = { '1' '1h' '2' '2h' '3' '3h' '4' '4h' '5' '5h'...
'6' '6h' 'z' };
ch.TP = { '10' '10h' '7' '7h' '8' '8h' '9' '9h' };
ch.CP = { '1' '1h' '2' '2h' '3' '3h' '4' '4h' '5' '5h'...
'6' '6h' 'z' };
ch.P = { '1' '10' '10h' '1h' '2' '2h' '3' '3h' '4' '4h'...
'5' '5h' '6' '6h' '7' '7h' '8' '8h' '9' '9h' 'z' };
ch.PO = { '1' '10' '10h' '1h' '2' '2h' '3' '3h' '4' '4h'...
'5' '5h' '6' '6h' '7' '7h' '8' '8h' '9' '9h' 'z' };
ch.O = { '1' '1h' '2' '2h' 'z' };
ch.I = { '1' '1h' '2' '2h' 'z' };
ch.AFF = { '1' '10' '10h' '1h' '2' '2h' '3' '3h' '4' '4h'...
'5' '5h' '6' '6h' '7' '7h' '8' '8h' '9' '9h' 'z' };
ch.FFT = { '10' '10h' '7' '7h' '8' '8h' '9' '9h' };
ch.FFC = { '1' '1h' '2' '2h' '3' '3h' '4' '4h' '5' '5h' '6'...
'6h' 'z' };
ch.FTT = { '10' '10h' '7' '7h' '8' '8h' '9' '9h' };
ch.TTP = { '10' '10h' '7' '7h' '8' '8h' '9' '9h' };
ch.CCP = { '1' '1h' '2' '2h' '3' '3h' '4' '4h' '5' '5h' '6'...
'6h' 'z' };
ch.TPP = { '10' '10h' '7' '7h' '8' '8h' '9' '9h' };
ch.CPP = { '1' '1h' '2' '2h' '3' '3h' '4' '4h' '5' '5h' '6'...
'6h' 'z' };
ch.PPO = { '1' '10' '10h' '1h' '2' '2h' '3' '3h' '4' '4h' '5'...
'5h' '6' '6h' '7' '7h' '8' '8h' '9' '9h' 'z' };
ch.POO = { '1' '10' '10h' '1h' '2' '2h' '3' '3h' '4' '4h' '5'...
'5h' '6' '6h' '7' '7h' '8' '8h' '9' '9h' 'z' };
ch.OI = { '1' '1h' '2' '2h' 'z' };
ch.A = { '1' '2' };
ch.M = { '1' '2' };
otherwise
error('illegal senstype %s', sens_type);
end
prefixes = fieldnames(ch);
nprefix = numel(prefixes);
label_cell = cell(nprefix, 1);
for k = 1:nprefix
prefix = prefixes{k};
postfixes = ch.(prefix);
npostfix = numel(postfixes);
prefix_labels = cell(npostfix, 1);
for j = 1:npostfix
postfix = postfixes{j};
prefix_labels{j} = [prefix postfix];
end
label_cell{k} = prefix_labels;
end
labels = cat(1, label_cell{:});
function labels = get_4d248_chan(chan_type)
labels = get_4dX48_chan_helper(chan_type, 248);
function labels = get_4d148_chan(chan_type)
labels = get_4dX48_chan_helper(chan_type, 148);
function labels = get_yokogawa64_chan(chan_type)
labels = get_yokogawaX_chan_helper(chan_type, 64);
function labels = get_yokogawa160_chan(chan_type)
labels = get_yokogawaX_chan_helper(chan_type, 160);
function labels = get_yokogawaX_chan_helper(chan_type, nchan)
labels = get_general_chan_helper(chan_type, nchan, ...
'AG%03d', 'AG%03d_dH', 'AG%03d_dV');
function labels = get_4dX48_chan_helper(chan_type, nchan)
labels = get_general_chan_helper(chan_type, nchan, ...
'A%d', 'A%d_dH', 'A%d_dV');
function labels = get_general_chan_helper(chan_type, chan_vals, ...
pat_combined, pat_planar1, pat_planar2)
% if chan_vals is a scalar, it indicates the number of channels
if isnumeric(chan_vals) && numel(chan_vals) == 1
% take the last channels
chan_vals = num2cell(chan_vals:-1:1);
end
generate = @(pat) cellfun(@(x)sprintf(pat, x), chan_vals, ...
'UniformOutput', false);
switch chan_type
case {'', 'planar_combined'}
labels = generate(pat_combined)';
case 'planar'
labels = [generate(pat_planar1) generate(pat_planar2)]';
otherwise
error('illegal chan type %s', chan_type);
end
function labels = get_CTF151_chan(chan_type)
chan_vals = get_CTF151_chan_prefixes();
labels = get_general_chan_helper(chan_type, chan_vals, ...
'%s', '%s_dH', '%s_dV');
function labels = get_CTF151_chan_prefixes()
% get channels for CTF151
lats = 'LRZ'; % lateralities
locs = 'CFOPT'; % brain part
counts = [repmat({{[0, 5, 4, 3, 3], [0, 2, 3, 4, 5, 2], [0, 2, 2, 3, 3], ...
[0, 3, 2, 4], [0, 6, 6, 5, 4]}}, 2, 1); ...
{{2, 3, 2, 2}}];
labels = cell(1, 151);
pos = 0;
for i = 1:numel(lats)
lat = lats(i);
count = counts{i};
for k = 1:numel(count)
loc = locs(k);
cs = count{k};
for j = 1:numel(cs)
for m = 1:cs(j)
label = sprintf('M%s%s%d%d', lat, loc, j - 1, m);
pos = pos + 1;
labels{pos} = label;
end
end
end
end
function labels = get_neuromag_chan(chan_type)
% get channels for neuromag system
all_chan_idxs = [repmat((1:26)', 4, 1) kron((1:4)', ones(26, 1))];
remove_msk = cosmo_match(all_chan_idxs(:, 1), 8) & ...
cosmo_match(all_chan_idxs(:, 2), [3 4]);
chan_idxs = all_chan_idxs(~remove_msk, :);
nloc = size(chan_idxs, 1);
assert(nloc == 102);
infix = reshape(sprintf('%02d%d', chan_idxs'), 3, [])';
as_column = @(x) repmat(x, nloc, 1);
meg = as_column('MEG');
one = as_column('1');
two = as_column('2');
three = as_column('3');
plus_ = as_column('+');
labels = cell(nloc, 4);
labels(:, 1) = cellstr([meg infix one]);
labels(:, 2) = cellstr([meg infix two]);
labels(:, 3) = cellstr([meg infix three]);
labels(:, 4) = cellstr([meg infix two plus_ infix three]);
keep_col = false(1, 4);
types = cosmo_strsplit(chan_type, '+');
for k = 1:numel(types)
tp = types{k};
switch tp
case {'all', ''}
keep_col([1 2 3]) = true;
case 'planar_combined'
keep_col(4) = true;
case 'planar'
keep_col([2 3]) = true;
case 'axial'
keep_col(1) = true;
otherwise
error('Unsupported channel type %s', tp);
end
end
if any(keep_col(2:3)) && keep_col(4)
error('planar and cmb/combined are mutually exclusive');
end
labels = labels(:, keep_col)';
labels = labels(:);
function dim_size = get_dim_size(size_label, opt)
% get dimension size. NaN means use input size
size2dim = struct();
size2dim.tiny = [2 1 1];
size2dim.small = [3 2 1];
size2dim.normal = [3 2 5];
size2dim.big = [NaN 7 5];
size2dim.huge = [NaN 17 19];
if ~isfield(size2dim, size_label)
error('Unsupported size %s. Supported are: %s', ...
size_label, cosmo_strjoin(fieldnames(size2dim), ', '));
end
dim_size = size2dim.(size_label);
if strcmp(opt.type, 'source')
dim_size(1) = 4;
if strcmp(opt.data_field, 'mom')
dim_size(2) = 3;
end
end
function [ds, nfeatures] = get_fdim_fa(opt)
data_type = opt.type;
size_label = opt.size;
% get dimension labels
a = dim_labels_values(opt);
labels = a.fdim.labels;
values = a.fdim.values;
a = rmfield(a, 'fdim');
% get dimension values
dim_sizes = get_dim_size(size_label, opt);
nlabel = numel(labels);
if nlabel == 1
% surface case
dim_sizes = prod(dim_sizes);
else
% everything else
dim_sizes = dim_sizes(1:nlabel);
end
for k = 1:numel(dim_sizes)
if isnan(dim_sizes(k))
dim_sizes(k) = size(values{k}, 2);
end
if isvector(values{k})
values{k} = values{k}(1:dim_sizes(k));
else
values{k} = values{k}(:, 1:dim_sizes(k));
end
end
% get ds with proper size
ds = cosmo_flatten(zeros([1 dim_sizes]), labels, values, 2, ...
'matrix_labels', {'pos'});
ds.a = cosmo_structjoin(ds.a, a);
switch data_type
case 'fmri'
ds.a.vol.dim = dim_sizes;
ds.a.vol.xform = 'scanner_anat';
end
nfeatures = prod(dim_sizes);