% EXAMPLE_DENOISING Example of use of software routines for the separation
% of one speech source and diffuse background noise
%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% Copyright 2011 Emmanuel Vincent
% This software is distributed under the terms of the GNU Public License
% version 3 (http://www.gnu.org/licenses/gpl.txt)
% If you find it useful, please cite the following reference:
% C. Blandin, A. Ozerov and E. Vincent, "Multi-source TDOA estimation in
% reverberant audio using angular spectra and clustering", Signal
% Processing, to appear.
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

% Input time-frequency representation
% Read the two-channel mixture and transform it to the STFT domain.
[x,fs]=wavread('dev/dev_Sq1_Co_B_mix.wav');
nsampl=length(x);
wlen=1024;                     % STFT window length (samples)
X=stft_multi(x.',wlen);        % X is nbin x nfram x nchan
[nbin,nfram,~]=size(X);
f=fs/wlen*(0:wlen/2).';        % center frequency (Hz) of each STFT bin

% TDOA estimation by GCC-PHATmax
d=0.086;                       % microphone spacing (m)
c=343;                         % sound speed (m/s)
ngrid=181;                     % number of candidate TDOAs on the grid
tau_grid=linspace(-d/c,d/c,ngrid);   % physically feasible TDOA range for spacing d
% PHAT-weighted cross-spectrum: magnitude normalized away, phase kept.
spec=zeros(nbin,nfram,ngrid);
P=X(:,:,1).*conj(X(:,:,2));
P=P./abs(P);
% Angular spectrum: correlation of the observed phase with each candidate delay.
for ind=1:ngrid,
    spec(:,:,ind) = real(P.*repmat(exp(-2*1i*pi*tau_grid(ind)*f),1,nfram));
end
% Sum over frequency, take the max over frames, then pick the best candidate.
spec = shiftdim(max(sum(spec,1),[],2));
[~,tau]=max(spec);
tau=tau_grid(tau); % TDOA in seconds

% ML target and noise variance estimation under a diffuse noise model
% Diffuse noise has inter-channel coherence sinc(2*f*d/c); the target has
% steering vector [1; exp(-2i*pi*tau*f)] fixed by the estimated TDOA.
SINC=sinc(2*f*d/c);
SINC2 = SINC.^2;
EXP=exp(-2i*pi*tau*f);
P=SINC.*EXP;
% Closed-form per-bin inverses used by the ML estimator (see the cited paper).
invA11=sqrt(.5)./(1-real(P)).*(1-conj(P));
invA12=-(1-P)./(SINC-EXP).*invA11;
DEN=.5./(1-2*real(P)+SINC2);
invL12=(SINC2-1).*DEN;
invL22=2*(1-real(P)).*DEN;
% NOTE: ARA2 must be computed from ARA1 BEFORE ARA1 is updated in place —
% the two expressions share the first line but differ in the sign/conjugate
% of the cross term. Do not reorder these three statements.
ARA1=repmat(abs(invA11).^2,1,nfram).*abs(X(:,:,1)).^2+repmat(abs(invA12).^2,1,nfram).*abs(X(:,:,2)).^2;
ARA2=ARA1-2*real(repmat(invA11.*invA12,1,nfram).*X(:,:,2).*conj(X(:,:,1)));
ARA1=ARA1+2*real(repmat(invA11.*conj(invA12),1,nfram).*X(:,:,1).*conj(X(:,:,2)));
vs=.5*ARA1+repmat(invL12,1,nfram).*ARA2;   % target (speech) variance
vb=repmat(invL22,1,nfram).*ARA2;           % diffuse noise variance
% Clamp infeasible ML solutions: where either variance goes negative the
% target variance is zeroed (the bin is treated as noise-only).
% NOTE(review): vb is left possibly negative here — presumably harmless for
% the Wiener filter below, but verify against the reference implementation.
neg=(vs < 0) | (vb < 0);
vs(neg)=0;

% Source separation via multichannel Wiener filtering
SE=zeros(nbin,nfram);
% Apply the per-bin multichannel Wiener filter and reconstruct the source
% estimate SE (single channel) and the spatial images IE (source, noise).
IE=zeros(nbin,nfram,2,2);
% Loop index renamed from f to b so the frequency vector f built above is
% not clobbered. Bin 1 (DC) is handled separately below because the
% steering/coherence model degenerates there.
for b=2:nbin,
    for t=1:nfram,
        Rb=vb(b,t)*[1 SINC(b); SINC(b) 1];   % diffuse noise covariance
        As=[1; EXP(b)];                      % target steering vector
        Rx=vs(b,t)*(As*As')+Rb;             % mixture covariance model
        Xbt=squeeze(X(b,t,:));               % hoisted: used by both filters
        SE(b,t)=vs(b,t)*As'/Rx*Xbt;          % MWF source estimate
        IE(b,t,1,:)=As*SE(b,t);              % source spatial image
        IE(b,t,2,:)=Rb/Rx*Xbt;               % noise spatial image
    end
end
% DC bin: split the mixture evenly between source and noise images.
IE(1,:,1,:)=.5*X(1,:,:);
IE(1,:,2,:)=.5*X(1,:,:);
se=istft_multi(SE,nsampl); % estimated speech signal
% FIX: write at the actual input sampling rate fs instead of a hard-coded
% 16000, so outputs stay correct if the input file is not 16 kHz.
wavwrite(se.',fs,'results/dev_Sq1_Co_B_src.wav');
ie=istft_multi(IE,nsampl); % estimated speech and noise spatial images
wavwrite(reshape(ie(1,:,:),nsampl,2),fs,'results/dev_Sq1_Co_B_sim.wav');
wavwrite(reshape(ie(2,:,:),nsampl,2),fs,'results/dev_Sq1_Co_B_noi.wav');

% Evaluation of the estimated source against the reference signals
% shipped with the dataset (dev/ holds the ground truth).
s=wavread('dev/dev_Sq1_Co_B_src.wav').';
n=wavread('dev/dev_Sq1_Co_B_noi.wav').';
[SDR,SIR,SAR]=bss_eval_source_denoising(se,s,n);

% Evaluation of the estimated source images
i=zeros(2,nsampl,2);
i(1,:,:)=wavread('dev/dev_Sq1_Co_B_sim.wav');
i(2,:,:)=n.';
[SDRi,ISRi,SIRi,SARi]=bss_eval_images_nosort(ie,i);

% Ideal binary masking benchmark (STFT)
b1=sep_ibm(x.',i);
[SDRb1,ISRb1,SIRb1,SARb1]=bss_eval_images_nosort(b1,i);

% Ideal binary masking benchmark (Cochleagram)
b2=zeros(2,nsampl,2);
for j=1:2,
    % Mask for image j computed against the competing image 3-j.
    ibm=generateIBM(reshape(i(j,:,:),nsampl,2),reshape(i(3-j,:,:),nsampl,2));
    b2(j,:,:)=generateEstimate(x,ibm);
end
[SDRb2,ISRb2,SIRb2,SARb2]=bss_eval_images_nosort(b2,i);