% EXAMPLE_DENOISING Example of use of software routines for the separation
% of one speech source and diffuse background noise
%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% Copyright 2011 Emmanuel Vincent
% This software is distributed under the terms of the GNU Public License
% version 3 (http://www.gnu.org/licenses/gpl.txt)
% If you find it useful, please cite the following reference:
% C. Blandin, A. Ozerov and E. Vincent, "Multi-source TDOA estimation in
% reverberant audio using angular spectra and clustering", Signal
% Processing, to appear.
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

% Input time-frequency representation
% Read the two-channel mixture and transform it to the STFT domain.
[x,fs]=wavread('dev/dev_Sq1_Co_B_mix.wav');
nsampl=length(x);
wlen=1024;                     % STFT window length (samples)
X=stft_multi(x.',wlen);        % X is nbin x nfram x nchan
[nbin,nfram,~]=size(X);
f=fs/wlen*(0:wlen/2).';        % center frequency (Hz) of each STFT bin

% TDOA estimation by GCC-PHATmax
d=0.086;                       % microphone spacing (m)
c=343;                         % sound speed (m/s)
ngrid=181;                     % number of candidate TDOAs on the grid
tau_grid=linspace(-d/c,d/c,ngrid);   % physically feasible TDOA range for spacing d
% PHAT-weighted cross-spectrum: magnitude normalized away, phase kept.
spec=zeros(nbin,nfram,ngrid);
P=X(:,:,1).*conj(X(:,:,2));
P=P./abs(P);
% Angular spectrum: correlation of the observed phase with each candidate delay.
for ind=1:ngrid,
    spec(:,:,ind) = real(P.*repmat(exp(-2*1i*pi*tau_grid(ind)*f),1,nfram));
end
% Sum over frequency, take the max over frames, then pick the best candidate.
spec = shiftdim(max(sum(spec,1),[],2));
[~,tau]=max(spec);
tau=tau_grid(tau); % TDOA in seconds

% ML target and noise variance estimation under a diffuse noise model
% Diffuse noise has inter-channel coherence sinc(2*f*d/c); the target has
% steering vector [1; exp(-2i*pi*tau*f)] fixed by the estimated TDOA.
SINC=sinc(2*f*d/c);
SINC2 = SINC.^2;
EXP=exp(-2i*pi*tau*f);
P=SINC.*EXP;
% Closed-form per-bin inverses used by the ML estimator (see the cited paper).
invA11=sqrt(.5)./(1-real(P)).*(1-conj(P));
invA12=-(1-P)./(SINC-EXP).*invA11;
DEN=.5./(1-2*real(P)+SINC2);
invL12=(SINC2-1).*DEN;
invL22=2*(1-real(P)).*DEN;
% NOTE: ARA2 must be computed from ARA1 BEFORE ARA1 is updated in place —
% the two expressions share the first line but differ in the sign/conjugate
% of the cross term. Do not reorder these three statements.
ARA1=repmat(abs(invA11).^2,1,nfram).*abs(X(:,:,1)).^2+repmat(abs(invA12).^2,1,nfram).*abs(X(:,:,2)).^2;
ARA2=ARA1-2*real(repmat(invA11.*invA12,1,nfram).*X(:,:,2).*conj(X(:,:,1)));
ARA1=ARA1+2*real(repmat(invA11.*conj(invA12),1,nfram).*X(:,:,1).*conj(X(:,:,2)));
vs=.5*ARA1+repmat(invL12,1,nfram).*ARA2;   % target (speech) variance
vb=repmat(invL22,1,nfram).*ARA2;           % diffuse noise variance
% Clamp infeasible ML solutions: where either variance goes negative the
% target variance is zeroed (the bin is treated as noise-only).
% NOTE(review): vb is left possibly negative here — presumably harmless for
% the Wiener filter below, but verify against the reference implementation.
neg=(vs < 0) | (vb < 0);
vs(neg)=0;

% Source separation via multichannel Wiener filtering
SE=zeros(nbin,nfram);
% Apply the per-bin multichannel Wiener filter and reconstruct the source
% estimate SE (single channel) and the spatial images IE (source, noise).
IE=zeros(nbin,nfram,2,2);
% Loop index renamed from f to b so the frequency vector f built above is
% not clobbered. Bin 1 (DC) is handled separately below because the
% steering/coherence model degenerates there.
for b=2:nbin,
    for t=1:nfram,
        Rb=vb(b,t)*[1 SINC(b); SINC(b) 1];   % diffuse noise covariance
        As=[1; EXP(b)];                      % target steering vector
        Rx=vs(b,t)*(As*As')+Rb;             % mixture covariance model
        Xbt=squeeze(X(b,t,:));               % hoisted: used by both filters
        SE(b,t)=vs(b,t)*As'/Rx*Xbt;          % MWF source estimate
        IE(b,t,1,:)=As*SE(b,t);              % source spatial image
        IE(b,t,2,:)=Rb/Rx*Xbt;               % noise spatial image
    end
end
% DC bin: split the mixture evenly between source and noise images.
IE(1,:,1,:)=.5*X(1,:,:);
IE(1,:,2,:)=.5*X(1,:,:);
se=istft_multi(SE,nsampl); % estimated speech signal
% FIX: write at the actual input sampling rate fs instead of a hard-coded
% 16000, so outputs stay correct if the input file is not 16 kHz.
wavwrite(se.',fs,'results/dev_Sq1_Co_B_src.wav');
ie=istft_multi(IE,nsampl); % estimated speech and noise spatial images
wavwrite(reshape(ie(1,:,:),nsampl,2),fs,'results/dev_Sq1_Co_B_sim.wav');
wavwrite(reshape(ie(2,:,:),nsampl,2),fs,'results/dev_Sq1_Co_B_noi.wav');

% Evaluation of the estimated source against the reference signals
% shipped with the dataset (dev/ holds the ground truth).
s=wavread('dev/dev_Sq1_Co_B_src.wav').';
n=wavread('dev/dev_Sq1_Co_B_noi.wav').';
[SDR,SIR,SAR]=bss_eval_source_denoising(se,s,n);

% Evaluation of the estimated source images
i=zeros(2,nsampl,2);
i(1,:,:)=wavread('dev/dev_Sq1_Co_B_sim.wav');
i(2,:,:)=n.';
[SDRi,ISRi,SIRi,SARi]=bss_eval_images_nosort(ie,i);

% Ideal binary masking benchmark (STFT)
b1=sep_ibm(x.',i);
[SDRb1,ISRb1,SIRb1,SARb1]=bss_eval_images_nosort(b1,i);

% Ideal binary masking benchmark (Cochleagram)
b2=zeros(2,nsampl,2);
for j=1:2,
    % Mask for image j computed against the competing image 3-j.
    ibm=generateIBM(reshape(i(j,:,:),nsampl,2),reshape(i(3-j,:,:),nsampl,2));
    b2(j,:,:)=generateEstimate(x,ibm);
end
[SDRb2,ISRb2,SIRb2,SARb2]=bss_eval_images_nosort(b2,i);