A Speech Recognition Extension to Snack

This is an example of how a speech recognizer and Snack can be interfaced. The recognizer itself (the RT*() functions) is not provided. The interface uses the Snack API to track changes to a sound object and to process its contents as they change, even in pseudo real time.

A minimal script using this package is shown below:



# Load Snack and the speech recognition extension. The extension provides
# the package "starlite" and the "recognizer" command.
package require snack
package require starlite

# Create a new Snack sound object
sound snd -frequency 8000

# Buttons that start and stop recording into the sound object
pack [ button .a -text Record -command {snd record} ]
pack [ button .b -text Stop   -command {snd stop} ]

# Create a recognizer linked to the sound object with a simple callback
# (puts) that just prints the recognizer output once the sound has been
# processed. The arguments are, in order: the two file names handed to the
# recognition engine, the name of the sound object to track, and the
# callback command.

recognizer test.LEX tri.HMM snd puts



Each time a new utterance has been recorded and the Stop button has been clicked, the resulting output of the speech recognizer is printed. Also, if, for example, the following command is issued:

snd read sentence.wav

the file sentence.wav is read and processed, and the resulting output of the speech recognizer is printed.
It is possible to set up several speech recognizers in parallel, all tracking the same sound object. These recognizers can have different grammars or even acoustic models trained on different languages.


Implementation of the interface

#include "RTStar.h"
#include "System.h"
#include "tcl.h"
#include "snack.h"

/*
  Per-recognizer state. One instance is allocated by RTStarEngineCmd() and
  handed as client data both to the recognizer object command (engine_cmd)
  and to the Snack sound callback (ProcessUtterance).
 */
typedef struct Recog {
  RTStarEngine *engine;       /* engine returned by InitRTStarEngine() */
  RTStarUtterance *utterance; /* utterance being processed; created by
                                 InitRTStarUtterance() on SNACK_NEW_SOUND */
  Sound *snd;                 /* Snack sound object being tracked */
  int id;                     /* callback id returned by Snack_AddCallback(),
                                 needed for Snack_RemoveCallback() */
  Tcl_Obj *cmdPtr;            /* callback script; the recognizer output is
                                 appended to it before evaluation */
  Tcl_Interp *interp;         /* interpreter in which the callback is run */
  int forwardPos;             /* position in the sound up to which data has
                                 been passed to RTStarForward() */
  int nBest;                  /* passed to RTStarOutput(); set with
                                 "configure NBEST" (initially 1) */
  int outputFormat;           /* RTO_* value passed to RTStarOutput(); set
                                 with "configure OUTFORMAT" */
} Recog;

/*
  The function that handles the sub-commands of a recognition object command.

  Sub-commands:

    configure NBEST n           Set the nBest value handed to RTStarOutput().
    configure OUTFORMAT format  Select the output format; format is one of
                                RTO_WORDS, RTO_WnS, RTO_MIX, RTO_WAVES,
                                RTO_GRAPH or RTO_MINIMAL_GRAPH.
    configure option value      Any other option/value pair is passed on
                                unchanged to RTStarControl().
    destroy                     Delete the object command, detach from the
                                sound and release the engine.

  Returns TCL_OK on success. On failure TCL_ERROR is returned and a message
  is left in the interpreter result.
 */

static int
engine_cmd(ClientData clientData, Tcl_Interp *interp, int objc,
	   Tcl_Obj *CONST objv[])
{
  Recog *r = (Recog *)clientData;
  int index;
  static char *optionStrings[] = {
    "configure", "destroy", NULL
  };
  enum options {
    CONFIG, DESTROY
  };

  if (objc < 2) {
    Tcl_WrongNumArgs(interp, 1, objv, "option ?arg ...?");
    return TCL_ERROR;
  }

  if (Tcl_GetIndexFromObj(interp, objv[1], optionStrings, "option", 0,
			  &index) != TCL_OK) {
    return TCL_ERROR;
  }

  switch ((enum options) index) {
  case CONFIG:
    {
      char *string1 = NULL;
      char *string2 = NULL;

      /*
	Both the option name and its value are required; without this
	check objv[2] and objv[3] would be read past the end of objv.
	Surplus arguments are ignored, as before.
	*/

      if (objc < 4) {
	Tcl_WrongNumArgs(interp, 2, objv, "option value");
	return TCL_ERROR;
      }
      string1 = Tcl_GetStringFromObj(objv[2], NULL);
      string2 = Tcl_GetStringFromObj(objv[3], NULL);

      if (strcmp(string1, "NBEST") == 0) {
	if (Tcl_GetIntFromObj(interp, objv[3], &r->nBest) != TCL_OK) {
	  return TCL_ERROR;
	}
      }
      else if (strcmp(string1, "OUTFORMAT") == 0) {
	if (strcmp(string2, "RTO_WORDS") == 0) {
	  r->outputFormat = RTO_WORDS;
	}
	else if (strcmp(string2, "RTO_WnS") == 0) {
	  r->outputFormat = RTO_WnS;
	}
	else if (strcmp(string2, "RTO_MIX") == 0) {
	  r->outputFormat = RTO_MIX;
	}
	else if (strcmp(string2, "RTO_WAVES") == 0) {
	  r->outputFormat = RTO_WAVES;
	}
	else if (strcmp(string2, "RTO_GRAPH") == 0) {
	  r->outputFormat = RTO_GRAPH;
	}
	else if (strcmp(string2, "RTO_MINIMAL_GRAPH") == 0) {
	  r->outputFormat = RTO_MINIMAL_GRAPH;
	}
	else {
	  /* Report a misspelled format instead of silently ignoring it. */
	  Tcl_AppendResult(interp, "unknown output format \"", string2,
			   "\": must be RTO_WORDS, RTO_WnS, RTO_MIX, "
			   "RTO_WAVES, RTO_GRAPH, or RTO_MINIMAL_GRAPH",
			   (char *) NULL);
	  return TCL_ERROR;
	}
      }
      else {
	RTStarControl(r->engine, string1, string2);
      }
      break;
    }
  case DESTROY:
    {
      char *string = Tcl_GetStringFromObj(objv[0], NULL);

      Tcl_DeleteCommand(interp, string);
      Snack_RemoveCallback(r->snd, r->id);
      CloseRTStarEngine(r->engine);

      /*
	Drop the reference that RTStarEngineCmd() took on the callback
	script, otherwise the object is never freed.
	*/

      Tcl_DecrRefCount(r->cmdPtr);
      ckfree((char *) r);
      break;
    }
  }

  return TCL_OK;
}

/*
  The callback that is executed whenever there is a change to
  the sound, which the recognizer should process.

  clientData is the Recog struct registered with Snack_AddCallback() and
  flag is the Snack change notification (SNACK_NEW_SOUND starts a new
  utterance). New sound data is fed to the recognizer, and once the sound
  is idle the result is computed and appended, as one extra argument, to
  the user's callback script, which is then evaluated at global level.
 */

static void
ProcessUtterance(ClientData clientData, int flag)
{
  Recog *r = (Recog *) clientData;
  Sound *s = r->snd;
  Tcl_Interp *interp = r->interp;
  int len = Snack_GetLength(s) - r->forwardPos;
  char *res = NULL;
  Tcl_Obj *cmd = NULL;
  Tcl_Obj *resObj = NULL;
  Tcl_Obj *argObj = NULL;
  short *buffer = NULL;

  /*
    Sound object has changed as a result of, e.g., record or read.
    Initialize processing of the sound here.
    */

  if (flag == SNACK_NEW_SOUND) {
    r->utterance = InitRTStarUtterance(r->engine);
  }

  /*
    Without an active utterance there is nothing to feed the data into.
    This happens if the utterance could not be created, or if the sound
    is modified after the previous utterance was closed below and no
    SNACK_NEW_SOUND notification has been seen since.
    */

  if (r->utterance == NULL) {
    return;
  }

  /*
    Process all new sound data. Might be a small recorded chunk or all of
    the sound contents.
    */

  if (len > 0) {
    if ((buffer = (short *) ckalloc(len * sizeof(short))) == NULL) {
      return;
    }
    /*
      NOTE(review): the size argument is given in bytes here; confirm
      that this matches the Snack_GetSoundData() of the Snack version
      in use.
      */
    Snack_GetSoundData(s, r->forwardPos, buffer, len * sizeof(short));
    RTStarForward(r->utterance, buffer, len);
    ckfree((char *) buffer);
  }
  r->forwardPos = Snack_GetLength(s);

  /*
    If Snack_GetSoundStatus() returns IDLE, either stop was called or the
    sound changed contents through, e.g., a read command. Do postprocessing.
    */

  if (Snack_GetSoundStatus(s) == IDLE) {
    RTStarBackTrace(r->utterance);
    res = RTStarOutput(r->utterance, r->nBest, r->outputFormat);
    if (res == NULL) {
      /*
	A NULL here would terminate the argument list of
	Tcl_AppendStringsToObj() early and yield a malformed script.
	*/
      res = "";
    }
    Tcl_Preserve((ClientData) interp);

    /*
      Append the result to the callback script as exactly one word.
      Formatting it as a one-element list lets Tcl do the quoting, so
      output containing unbalanced braces cannot break the script.
      */

    cmd = Tcl_DuplicateObj(r->cmdPtr);
    Tcl_IncrRefCount(cmd);
    resObj = Tcl_NewStringObj(res, -1);
    argObj = Tcl_NewListObj(1, &resObj);
    Tcl_IncrRefCount(argObj);
    Tcl_AppendStringsToObj(cmd, " ", Tcl_GetStringFromObj(argObj, NULL),
			   (char *) NULL);
    Tcl_DecrRefCount(argObj);

    /*
      Execute the callback
      */

    if (Tcl_GlobalEvalObj(interp, cmd) != TCL_OK) {
      Tcl_AddErrorInfo(interp, "\n    (\"command\" script)");
      Tcl_BackgroundError(interp);
    }
    Tcl_DecrRefCount(cmd);
    Tcl_Release((ClientData) interp);

    /*
      The utterance is finished. Clear the pointer so that a later
      notification cannot reuse the closed utterance.
      */

    CloseRTStarUtterance(r->utterance);
    r->utterance = NULL;
    r->forwardPos = 0;
  }
}

/*
  Implements the "recognizer" command:

    recognizer lexnet annfile sound command

  lexnet and annfile are the file names given to InitRTStarEngine(), sound
  is the name of the Snack sound object to track, and command is the
  callback script that receives the recognizer output.

  On success a new object command "recognizer<N>" is created, its name is
  left in the interpreter result and TCL_OK is returned. On failure
  TCL_ERROR is returned with a message in the interpreter result, and
  everything acquired up to that point is released again.
 */

int
RTStarEngineCmd(ClientData cdata, Tcl_Interp *interp, int objc,
		Tcl_Obj *CONST objv[])
{
  static int id = 0;
  char name[32];   /* "recognizer" + any int + NUL fits with room to spare */
  char *lexnet_name = NULL;
  char *annfile_name = NULL;
  char *sound_name = NULL;
  Recog *r = NULL;

  /*
    All four arguments are needed; objv must not be read beyond objc.
    Surplus arguments are ignored, as before.
    */

  if (objc < 5) {
    Tcl_WrongNumArgs(interp, 1, objv, "lexnet annfile sound command");
    return TCL_ERROR;
  }

  sprintf(name, "recognizer%d", ++id);

  lexnet_name  = Tcl_GetStringFromObj(objv[1], NULL);
  annfile_name = Tcl_GetStringFromObj(objv[2], NULL);
  sound_name   = Tcl_GetStringFromObj(objv[3], NULL);

  r = (Recog *) ckalloc(sizeof(Recog));

  /*
    Initialize a new speech recognition engine.
    */

  r->engine = InitRTStarEngine(lexnet_name, annfile_name, HMMCLASSIFIER);

  if (r->engine == NULL) {
    Tcl_AppendResult(interp, "Error creating RTStarEngine, check filenames",
		     NULL);
    ckfree((char *) r);
    return TCL_ERROR;
  }

  /*
    Look up the sound before taking any further resources, and undo the
    engine creation if it does not exist.
    */

  if ((r->snd = Snack_GetSound(interp, sound_name)) == NULL) {
    CloseRTStarEngine(r->engine);
    ckfree((char *) r);
    return TCL_ERROR;
  }

  /*
    Fill in the struct with info related to the engine. The utterance
    pointer starts out as NULL; ckalloc() does not clear the memory.
    */

  Tcl_IncrRefCount(objv[4]);
  r->cmdPtr = objv[4];
  r->interp = interp;
  r->utterance = NULL;
  r->forwardPos = 0;
  r->nBest = 1;
  r->outputFormat = RTO_WORDS;
  r->id = Snack_AddCallback(r->snd, ProcessUtterance, (ClientData) r);

  /*
    Create an object command for this engine.
    */

  Tcl_CreateObjCommand(interp, name, engine_cmd, (ClientData) r,
		       (Tcl_CmdDeleteProc *) NULL);
  Tcl_SetObjResult(interp, Tcl_NewStringObj(name, -1));

  return TCL_OK;
}

/*
  Package initialization: announces the package "starlite" version 1.1 and,
  if that succeeds, registers the "recognizer" command. Returns the result
  code of Tcl_PkgProvide().
 */
EXPORT(int, Starlite_Init)(Tcl_Interp *interp)
{
  int status;

  status = Tcl_PkgProvide(interp, "starlite", "1.1");

  if (status == TCL_OK) {
    Tcl_CreateObjCommand(interp, "recognizer", RTStarEngineCmd,
			 NULL, (Tcl_CmdDeleteProc *)NULL);
  }

  return status;
}

/*
  Safe-interpreter entry point. It performs exactly the same initialization
  as Starlite_Init() and returns its result code.
 */
EXPORT(int, Starlite_SafeInit)(Tcl_Interp *interp)
{
  int status = Starlite_Init(interp);

  return status;
}


Snack home