17-03-2017, 04:44 PM
Speech recognition is the process of automatically recognizing a particular word pronounced by a particular person, based on the individual information contained in the speech waveform. This technique makes it possible to use the speaker's voice
to verify identity and provide controlled access to services such as voice-based biometrics, database access services, voice-based dialing, voicemail and remote computer access. Front-end signal processing to extract the feature set is an important step in any speech recognition system. The optimal set of features has not yet been settled, despite great efforts by researchers. There are many types of features, which are derived in different ways and have a considerable impact on the recognition rate. This project presents one technique for extracting a feature set from a speech signal, which can be used in voice recognition systems.
This program implements basic speech recognition for 6 symbols using MFCC and LPC features:
Code:
% American Sign Language Detection-Speech
% Tests one sample of the speech database: the selected recording is
% classified twice, once with MFCC features and once with LPC features,
% against Gaussian models trained on all recordings of every user.
clc
clear all
close all
%% Add training and test directories to path
addpath('./Training/')
cAlpha = [{'A'},{'B'},{'C'},{'Five'},{'Point'},{'V'}];% Symbol names; also used as image-file prefixes when displaying the result
%% Load Training set
% NOTE(review): presumably each .mat file holds one variable per recording,
% named <User>_<Symbol><SampleNo> -- confirm against the data files.
load Fan_all.mat
load Neeraj_all.mat
load Zhang_all.mat
%% Set variables
NoOfSamples = 25;
Users = {'Fan','Neeraj','Zhang'};
Letters = {'A','B','C','Five','Point','V'};
%% Select the test recording
InputP = strtrim(input('Enter person whose voice you want to use for input{Neeraj,Fan,Zhang)-','s'));
InputC = strtrim(input('Enter Symbol for input No(A,B,C,Point,Five,V)-','s'));
InputN = strtrim(input('Enter Sample No(1-25)-','s'));
% The three answers are concatenated into a variable name that is later
% passed to eval, so reject anything that is not a known user, a known
% symbol and a plain sample number in range before building that name.
if ~any(strcmp(InputP,Users))
    error('Unknown person "%s" (expected Neeraj, Fan or Zhang).',InputP);
end
if ~any(strcmp(InputC,Letters))
    error('Unknown symbol "%s" (expected A, B, C, Point, Five or V).',InputC);
end
SampleNo = str2double(InputN);
if isempty(regexp(InputN,'^\d+$','once')) || SampleNo < 1 || SampleNo > NoOfSamples
    error('Sample number must be an integer from 1 to %d.',NoOfSamples);
end
InputN = int2str(SampleNo);% normalise e.g. '03' to '3' so the name matches the training naming
InputIn = strcat(InputP,'_',InputC,InputN);
%MFCC parameters
OverlapSize = 0.5;% window hop, as a fraction of one window length
MFCCNo = 45;
NoOfWindows = 25;
NoOfFilters = floor(MFCCNo/NoOfWindows+1);% coefficients kept per window
% The extraction loops store NoOfFilters values for each of NoOfWindows
% windows, which is more than MFCCNo with the settings above; allocate the
% full width up front so the array does not have to grow while it is filled.
mfcc = zeros(size(Letters,2),size(Users,2)*NoOfSamples,max(MFCCNo,NoOfWindows*NoOfFilters));
%% MFCC Training for all letters
% Fills mfcc(letter, recording, :) with the feature vector of every training
% recording. Recordings of the same letter are stacked user after user, so
% each letter ends up with size(Users,2)*NoOfSamples rows.
for ii = 1:size(Letters,2)
    for jj = 1:size(Users,2)
        % First row of this user's band inside the letter's feature table.
        % (Indexing by kk alone made every user overwrite the previous one
        % and left the remaining rows at zero.)
        rowOffset = (jj-1)*NoOfSamples;
        for kk = 1:NoOfSamples
            % The recording is fetched by evaluating its name,
            % <User>_<Letter><k>, built from the fixed lists above.
            file_name = strcat(Users(jj),'_',Letters(ii),int2str(kk));
            Samples = eval(char(file_name));
            % NOTE(review): find(Samples) yields sample *indices*, so this
            % compares indices (not amplitudes) with max(Samples/3) and is
            % unlikely to threshold anything. Left unchanged on purpose: the
            % test-data extraction uses the identical expression and both
            % must preprocess the signal the same way.
            zz = find(Samples) < max(Samples/3);%Threshold speech regions
            Samples(zz) = 0;
            zz = find(Samples);
            % Keep the non-zero samples and scale them to unit Euclidean norm.
            Speech_Region = Samples(zz)/norm(Samples(zz));
            % NOTE(review): size(...,1) assumes the recording is a column vector.
            WindowSize = floor((size(Speech_Region,1))/(NoOfWindows+1));
            taper = hamming(WindowSize);% same for every window of this recording
            ww = 0;
            for ll = 0:OverlapSize:(NoOfWindows-1)/2
                firstSample = floor(ll*WindowSize)+1;
                bb = Speech_Region(firstSample:firstSample+WindowSize-1).*taper;
                fb = fft(bb);
                mb = 2595 * log10(1 + fb./700);
                % dct(x,n) works on the first n entries of x, giving
                % NoOfFilters coefficients per window.
                mfout = dct(log(abs(mb)),NoOfFilters);
                mfcc(ii,rowOffset+kk,ww*NoOfFilters+1:ww*NoOfFilters+NoOfFilters) = mfout;
                ww = ww + 1;
            end
        end
    end
end
%% Perform Gaussian Modelling for MFCC
% One single-component Gaussian model per symbol, fitted on that symbol's
% slice of mfcc laid out as (recordings x features).
Windows = size(mfcc,3);
RowsPerLetter = size(Users,2)*NoOfSamples;

tempStorage = reshape(mfcc(1,:,:),RowsPerLetter,Windows);
obj_A = gmdistribution.fit(tempStorage,1,'Regularize',0.01);

tempStorage = reshape(mfcc(2,:,:),RowsPerLetter,Windows);
obj_B = gmdistribution.fit(tempStorage,1,'Regularize',0.01);

tempStorage = reshape(mfcc(3,:,:),RowsPerLetter,Windows);
obj_C = gmdistribution.fit(tempStorage,1,'Regularize',0.01);

tempStorage = reshape(mfcc(4,:,:),RowsPerLetter,Windows);
obj_Five = gmdistribution.fit(tempStorage,1,'Regularize',0.01);

tempStorage = reshape(mfcc(5,:,:),RowsPerLetter,Windows);
obj_Point = gmdistribution.fit(tempStorage,1,'Regularize',0.01);

tempStorage = reshape(mfcc(6,:,:),RowsPerLetter,Windows);
obj_V = gmdistribution.fit(tempStorage,1,'Regularize',0.01);
%% Extract MFCC for test data
% Builds mfcc_test, the feature row of the recording named by InputIn, with
% the same steps the training loop applies to each training recording.
% InputIn originates from keyboard input, so make sure it names a variable
% that actually exists before handing it to eval.
TestVarName = char(InputIn);
if ~isvarname(TestVarName) || ~exist(TestVarName,'var')
    error('Test sample "%s" is not a loaded recording.',TestVarName);
end
Samples = eval(TestVarName);
% NOTE(review): find(Samples) yields sample *indices*, so this compares
% indices (not amplitudes) with max(Samples/3). Left unchanged on purpose so
% that test and training recordings are preprocessed identically.
zz = find(Samples) < max(Samples/3);%Threshold speech regions
Samples(zz) = 0;
zz = find(Samples);
% Scale to unit norm exactly as the training recordings are scaled; the
% features below depend on signal amplitude, so skipping this would compare
% test features against models trained on a different scale.
Speech_Region = Samples(zz)/norm(Samples(zz));
% The loop writes NoOfFilters values for each of NoOfWindows windows;
% allocate that width up front instead of letting the row grow.
mfcc_test = zeros(1,max(MFCCNo,NoOfWindows*NoOfFilters));
% NOTE(review): size(...,1) assumes the recording is a column vector.
WindowSize = floor((size(Speech_Region,1))/(NoOfWindows+1));
taper = hamming(WindowSize);
ww = 0;
for ll = 0:OverlapSize:(NoOfWindows-1)/2
    firstSample = floor(ll*WindowSize)+1;
    bb = Speech_Region(firstSample:firstSample+WindowSize-1).*taper;
    fb = fft(bb);
    mb = 2595 * log10(1 + fb./700);
    % dct(x,n) works on the first n entries of x.
    mfout = dct(log(abs(mb)),NoOfFilters);
    mfcc_test(1,ww*NoOfFilters+1:ww*NoOfFilters+NoOfFilters) = mfout;
    ww = ww + 1;
end
%% Classify MFCC test data on Mahanalobis distance
% Distance from the test feature row to each symbol model, in the same order
% as the symbol list (A, B, C, Five, Point, V); the smallest distance wins.
MfccModels = {obj_A,obj_B,obj_C,obj_Five,obj_Point,obj_V};
D1 = zeros(1,numel(MfccModels));
for modelIdx = 1:numel(MfccModels)
    D1(modelIdx) = mahal(MfccModels{modelIdx},mfcc_test);
end
[m, Ind] = min(D1);% Ind is the position of the recognised symbol
% LPC parameters
NoOfLPCFilters = 50;% prediction order passed to lpc
% lpc returns NoOfLPCFilters+1 coefficients per recording.
lpccoeff = zeros(numel(Letters),numel(Users)*NoOfSamples,NoOfLPCFilters+1);
%% LPC Training for all letters
% Stores the LPC coefficient row of every training recording in
% lpccoeff(letter, recording, :), stacking the users one after another.
for letterIdx = 1:numel(Letters)
    for userIdx = 1:numel(Users)
        for sampleIdx = 1:NoOfSamples
            % The recording is fetched by evaluating its name, <User>_<Letter><k>.
            recordingName = char(strcat(Users(userIdx),'_',Letters(letterIdx),int2str(sampleIdx)));
            Samples = eval(recordingName);
            % NOTE(review): this compares the indices returned by find with
            % max(Samples/3) rather than comparing amplitudes -- confirm intent.
            zz = find(Samples) < max(Samples/3);%Threshold speech regions
            Samples(zz) = 0;
            % Non-zero samples, scaled to unit Euclidean norm.
            zz = find(Samples);
            nonZero = Samples(zz);
            Speech_Region = nonZero/norm(nonZero);
            % Row position: users occupy consecutive bands of NoOfSamples rows.
            rowIdx = (userIdx-1)*NoOfSamples + sampleIdx;
            lpccoeff(letterIdx,rowIdx,:) = lpc(Speech_Region,NoOfLPCFilters);
        end
    end
end
%% Perform Gaussian Modelling for LPC
% One single-component Gaussian model per symbol, fitted on that symbol's
% LPC coefficients laid out as (recordings x coefficients). The first
% coefficient of every row is skipped (2:end). Each model is fitted once;
% the section used to be repeated verbatim, which only refitted the same
% six models on the same data.
tempStorage = zeros(size(Users,2)*NoOfSamples,NoOfLPCFilters);
tempStorage(:,:) = lpccoeff(1,:,2:end);
obj_A2 = gmdistribution.fit(tempStorage,1);
tempStorage(:,:) = lpccoeff(2,:,2:end);
obj_B2 = gmdistribution.fit(tempStorage,1);
tempStorage(:,:) = lpccoeff(3,:,2:end);
obj_C2 = gmdistribution.fit(tempStorage,1);
tempStorage(:,:) = lpccoeff(4,:,2:end);
obj_Five2 = gmdistribution.fit(tempStorage,1);
tempStorage(:,:) = lpccoeff(5,:,2:end);
obj_Point2 = gmdistribution.fit(tempStorage,1);
tempStorage(:,:) = lpccoeff(6,:,2:end);
obj_V2 = gmdistribution.fit(tempStorage,1);
%% Extract LPC for test data
% Builds lpc_test, the LPC coefficient row of the recording named by
% InputIn, with the same steps the LPC training loop applies.
% InputIn originates from keyboard input, so make sure it names a variable
% that actually exists before handing it to eval.
TestVarName = char(InputIn);
if ~isvarname(TestVarName) || ~exist(TestVarName,'var')
    error('Test sample "%s" is not a loaded recording.',TestVarName);
end
Samples = eval(TestVarName);
% NOTE(review): find(Samples) yields sample *indices*, so this compares
% indices (not amplitudes) with max(Samples/3). Left unchanged on purpose so
% that test and training recordings are preprocessed identically.
zz = find(Samples) < max(Samples/3);%Threshold speech regions
Samples(zz) = 0;
zz = find(Samples);
% Unit-norm scaling, matching the preprocessing of the training recordings.
Speech_Region = Samples(zz)/norm(Samples(zz));
lpc_test = lpc(Speech_Region,NoOfLPCFilters);
%% Classify LPC test data on Mahanalobis distance
% Distance from the test coefficients (first one skipped, as in training) to
% each symbol model, in the same order as the symbol list
% (A, B, C, Five, Point, V); the smallest distance wins.
LpcModels = {obj_A2,obj_B2,obj_C2,obj_Five2,obj_Point2,obj_V2};
LpcFeatures = lpc_test(2:end);
D2 = zeros(1,numel(LpcModels));
for modelIdx = 1:numel(LpcModels)
    D2(modelIdx) = mahal(LpcModels{modelIdx},LpcFeatures);
end
[m, Ind2] = min(D2);% Ind2 is the position of the recognised symbol
%% Display output
% Shows, side by side, the reference image of the symbol recognised from the
% MFCC features (left) and from the LPC features (right), then plays back
% the recording that was classified.
f = figure();
set(gca, 'fontsize', 28);
set(f,'name','Recognized Letter')
subplot (1,2,1)
% Reference images are looked up by name: <Symbol>-train1.jpg.
RecongImg = strcat(cAlpha(Ind),'-train1.jpg');
imshow(char(RecongImg));
title(strcat('Recognized Letter using MFCC-',cAlpha(Ind)),'fontsize',20);
subplot (1,2,2)
RecongImg = strcat(cAlpha(Ind2),'-train1.jpg');
imshow(char(RecongImg));
title(strcat('Recognized Letter using LPC-',cAlpha(Ind2)),'fontsize',20);
display('Input is');
% NOTE(review): fs is not assigned anywhere in this script; presumably it is
% loaded from one of the .mat files -- confirm, otherwise playback fails.
InputSignal = eval(char(InputIn));
% wavplay is Windows-only and is no longer shipped with current MATLAB
% releases, so use it only where it is available and fall back to sound
% otherwise. Unlike wavplay's default mode, sound returns immediately
% while the audio is still playing.
if ispc && exist('wavplay','file') ~= 0
    wavplay(InputSignal,fs);
else
    sound(InputSignal,fs);
end