cosmo normalize skl

function [ds, params]=cosmo_normalize(ds, params, dim)
% normalize dataset either by estimating or applying estimated parameters
%
% [ds, est_params]=cosmo_normalize(ds, norm_type[, dim])
%
% Inputs
%   ds            a dataset struct with field .samples of size PxQ, or a
%                 numeric array of that size
%   params        either the type of normalization:
%                   - 'demean'     (mean of zero)
%                   - 'zscore'     (demean and unit variance)
%                   - 'scale_unit' (scale to interval [-1,1])
%                 -or-
%                 previously estimated parameters using the 'params'
%                 output result from a previous call to this function.
%   dim           1 or 2, indicating along with dimension of ds to
%                 normalize (if params it not a string and ends with '1' or
%                 2').
%
% Output
%   ds_norm       a dataset struct similar to ds, but with .samples data
%                 normalized. If the input was a numeric array then ds_norm
%                 is a numeric array as well.
%   params        estimated parameters for normalization. These can be
%                 re-used for a second normalization step of an independant
%                 dataset. For example, parameters can be estimated from a
%                 training dataset and then applied to a testing dataset
%
%
% Examples:
%     ds=struct();
%     ds.samples=reshape(1:15,5,3)*2;
%     cosmo_disp(ds);
%     %|| .samples
%     %||   [  2        12        22
%     %||      4        14        24
%     %||      6        16        26
%     %||      8        18        28
%     %||     10        20        30 ]
%     %
%     % demean along first dimension
%     dsn=cosmo_normalize(ds,'demean',1);
%     cosmo_disp(dsn);
%     %|| .samples
%     %||   [ -4        -4        -4
%     %||     -2        -2        -2
%     %||      0         0         0
%     %||      2         2         2
%     %||      4         4         4 ]
%     %
%     % demean along second dimension
%     dsn=cosmo_normalize(ds,'demean',2);
%     cosmo_disp(dsn);
%     %|| .samples
%     %||   [ -10         0        10
%     %||     -10         0        10
%     %||     -10         0        10
%     %||     -10         0        10
%     %||     -10         0        10 ]
%     %
%     % scale to range [-1,1] along first dimension
%     dsn=cosmo_normalize(ds,'scale_unit',1);
%     cosmo_disp(dsn);
%     %|| .samples
%     %||   [   -1        -1        -1
%     %||     -0.5      -0.5      -0.5
%     %||        0         0         0
%     %||      0.5       0.5       0.5
%     %||        1         1         1 ]
%     %
%     % z-score along first dimension
%     dsn=cosmo_normalize(ds,'zscore',1);
%     cosmo_disp(dsn);
%     %|| .samples
%     %||   [  -1.26     -1.26     -1.26
%     %||     -0.632    -0.632    -0.632
%     %||          0         0         0
%     %||      0.632     0.632     0.632
%     %||       1.26      1.26      1.26 ]
%     %
%     % z-score along second dimension
%     dsn=cosmo_normalize(ds,'zscore',2);
%     cosmo_disp(dsn);
%     %|| .samples
%     %||   [ -1         0         1
%     %||     -1         0         1
%     %||     -1         0         1
%     %||     -1         0         1
%     %||     -1         0         1 ]
%     %
%     % use samples 1, 3, and 4 to estimate parameters ('training set'),
%     % and apply these to samples 2 and 5
%     ds_train=cosmo_slice(ds,[1 3 4]);
%     ds_test=cosmo_slice(ds,[2 5]);
%     [dsn_train,params]=cosmo_normalize(ds_train,'scale_unit', 1);
%     cosmo_disp(dsn_train);
%     %|| .samples
%     %||   [    -1        -1        -1
%     %||     0.333     0.333     0.333
%     %||         1         1         1 ]
%     %
%     % show estimated parameters (min and max for each column, in this
%     % case)
%     cosmo_disp(params);
%     %|| .method
%     %||   'scale_unit'
%     %|| .dim
%     %||   [ 1 ]
%     %|| .min
%     %||   [ 2        12        22 ]
%     %|| .max
%     %||   [ 8        18        28 ]
%     %
%     % apply parameters to test dataset
%     dsn_test=cosmo_normalize(ds_test,params);
%     cosmo_disp(dsn_test);
%     %|| .samples
%     %||   [ -0.333    -0.333    -0.333
%     %||       1.67      1.67      1.67 ]
%
% #   For CoSMoMVPA's copyright information and license terms,   #
% #   see the COPYING file distributed with CoSMoMVPA.           #

if isempty(params) || (ischar(params) && strcmp(params,'none'))
    return;
end

if nargin<3, dim=[]; end

apply_params=isstruct(params);
if apply_params;
    if ~isempty(dim) && params.dim~=dim
        error('Dim specified as %d, but estimates used %d',dim,params.dim);
    end
elseif ischar(params)
    params=get_params_from_string(params,dim);
else
    error('norm_spec must be struct or string');
end

method=params.method;
dim=params.dim;


is_ds=isstruct(ds) && isfield(ds,'samples');

if is_ds
    samples=ds.samples;
else
    samples=ds;
end

switch method
    case 'demean'
        if apply_params
            mu=params.mu;
        else
            mu=mean(samples,dim);
            params.mu=mu;
        end
        samples=bsxfun(@minus,samples,mu);

    case 'zscore'
        if apply_params
            mu=params.mu;
            sigma=params.sigma;
        else
            n=size(samples,dim);
            mu=sum(samples,dim)/n;
            params.mu=mu;
        end

        d=bsxfun(@minus,samples,mu);

        if ~apply_params
            sigma=sqrt(sum(d.^2,dim)/(n-1));
            params.sigma=sigma;
        end

        if isempty(sigma)
            % Octave 3.8.2 on Ubuntu gives it the wrong size
            sigma=zeros(size(d));
        end

        % In Octave, std can give non-NaN even with NaN input
        nan_mask=any(isnan(samples),dim);
        sigma(nan_mask)=NaN;

        samples=bsxfun(@rdivide,d,sigma);

    case 'scale_unit'
        if apply_params
            min_=params.min;
            max_=params.max;
        else
            min_=min(samples,[],dim);
            max_=max(samples,[],dim);
            params.min=min_;
            params.max=max_;

        end
        samples=bsxfun(@times,1./(max_-min_),...
                            (bsxfun(@minus,samples,min_)))*2-1;

    otherwise
        error('Unsupported normalization: %s', method);

end

nan_msk=~isfinite(samples);
if any(nan_msk(:))
    cosmo_warning('%d samples are NaN or Inf after normalization', ...
                        sum(nan_msk(:)));
end

if is_ds
    ds.samples=samples;
else
    ds=samples;
end

function params=get_params_from_string(norm_method,dim_arg)
    persistent cached_params;
    persistent cached_norm_method;
    persistent cached_dim_arg;


    if ~(isequal(norm_method, cached_norm_method) && ...
            isequal(dim_arg,cached_dim_arg))

        assert(ischar(norm_method));
        assert(numel(norm_method)>0);

        dim=[];
        method=norm_method;

        if isempty(dim_arg)
            if isempty(dim)
                dim=1;
            end
        else
            if isempty(dim)
                dim=dim_arg;
            else
                error(['Cannot have third argument when second argument '...
                            'ends with a number']);
            end
        end

        cached_params=struct();
        cached_params.method=method;
        cached_params.dim=dim;

        cached_norm_method=norm_method;
        cached_dim_arg=dim_arg;
    end

    params=cached_params;