/**
* Manager handling the live transcription of speech into text.
*
* @author Alain Pitiot
* @version 2022.2.3
* @copyright (c) 2022 Open Science Tools Ltd. (https://opensciencetools.org)
* @license Distributed under the terms of the MIT License
*/
import {Clock} from "../util/Clock";
import {PsychObject} from "../util/PsychObject";
import {PsychoJS} from "../core/PsychoJS";
/**
 * Transcript: a single recognition result, together with its timing information.
 */
export class Transcript
{
	/**
	 * Object holding a transcription result.
	 *
	 * <p>The timing information is copied from the transcriber when the transcript is created.</p>
	 *
	 * @param {SpeechRecognition} transcriber - the transcriber
	 * @param {string} [text= ''] - the transcript
	 * @param {number} [confidence= 0.0] - confidence in the transcript
	 */
	constructor(transcriber, text = '', confidence = 0.0)
	{
		// recognised text:
		this.text = text;

		// confidence in the recognition:
		this.confidence = confidence;

		// time the speech started, relative to the Transcriber clock:
		// note: the speech callbacks of the transcriber record this time as _currentSpeechStart,
		// which we fall back to when _speechStart has not been set
		this.speechStart = (typeof transcriber._speechStart !== 'undefined')
			? transcriber._speechStart
			: transcriber._currentSpeechStart;

		// time the speech ended, relative to the Transcriber clock (same fallback as above):
		this.speechEnd = (typeof transcriber._speechEnd !== 'undefined')
			? transcriber._speechEnd
			: transcriber._currentSpeechEnd;

		// time a recognition result was produced, relative to the Transcriber clock:
		this.time = transcriber._recognitionTime;
	}
}
/**
 * <p>This manager handles the live transcription of speech into text.</p>
 *
 * <p>It drives the browser's speech recogniser (window.SpeechRecognition or
 * window.webkitSpeechRecognition) and stores the recognised transcripts in a
 * circular buffer, from which they can be retrieved with getTranscripts.</p>
 *
 * @extends PsychObject
 * @todo deal with alternatives, interim results, and recognition errors
 */
export class SpeechRecognition extends PsychObject
{
	/**
	 * <p>This manager handles the live transcription of speech into text.</p>
	 *
	 * @memberOf module:sound
	 * @param {Object} options
	 * @param {module:core.PsychoJS} options.psychoJS - the PsychoJS instance
	 * @param {String} [options.name= 'speech recognition'] - the name used when logging messages
	 * @param {number} [options.bufferSize= 10000] - the maximum size of the circular transcript buffer
	 * @param {boolean} [options.continuous= true] - whether to continuously recognise
	 * @param {String} [options.lang= 'en-US'] - the spoken language
	 * @param {boolean} [options.interimResults= false] - whether to make interim results available
	 * @param {number} [options.maxAlternatives= 1] - the maximum number of recognition alternatives
	 * @param {String[]} [options.tokens= [] ] - the tokens to be recognised. This is experimental technology, not available in all browsers.
	 * @param {Clock} [options.clock= undefined] - an optional clock; a new Clock is created if none is given
	 * @param {boolean} [options.autoLog= false] - whether to log
	 *
	 * @todo deal with alternatives, interim results, and recognition errors
	 */
	constructor({psychoJS, name, bufferSize, continuous, lang, interimResults, maxAlternatives, tokens, clock, autoLog} = {})
	{
		super(psychoJS);

		// the callback is handed over as a plain function, so we bind it to make sure
		// that "this" is available however it ends up being invoked:
		const onChange = this._onChange.bind(this);

		this._addAttribute('name', name, 'speech recognition');
		this._addAttribute('bufferSize', bufferSize, 10000);
		this._addAttribute('continuous', continuous, true, onChange);
		this._addAttribute('lang', lang, 'en-US', onChange);
		this._addAttribute('interimResults', interimResults, false, onChange);
		this._addAttribute('maxAlternatives', maxAlternatives, 1, onChange);
		this._addAttribute('tokens', tokens, [], onChange);
		this._addAttribute('clock', clock, new Clock());
		// note: the value comes first and the default second, as for the other attributes
		this._addAttribute('autoLog', autoLog, false);
		this._addAttribute('status', PsychoJS.Status.NOT_STARTED);

		this._prepareRecognition();

		if (this._autoLog)
		{
			this._psychoJS.experimentLogger.exp(`Created ${this.name} = ${this.toString()}`);
		}
	}

	/**
	 * Start the speech recognition process.
	 *
	 * @return {Promise|undefined} promise fulfilled, with an object holding the start time,
	 * 	when the process actually starts; undefined if the process has already started
	 * @throws {Object} an {origin, context, error} object if the recognition could not be started
	 */
	start()
	{
		// nothing to do if the recognition is already running:
		if (this._status === PsychoJS.Status.STARTED)
		{
			return;
		}

		this._psychoJS.logger.debug('request to start the speech recognition process');

		try
		{
			if (!this._recognition)
			{
				throw 'the speech recognition has not been initialised yet, possibly because the participant has not given the authorisation to record audio';
			}

			this._recognition.start();

			// return a promise, which will be satisfied when the process actually starts,
			// which is also when the reset of the clock and the change of status takes place
			// TODO the reject callback is stored but the recognition callbacks do not invoke it yet
			return new Promise((resolve, reject) =>
			{
				this._startCallback = resolve;
				this._errorCallback = reject;
			});
		}
		catch (error)
		{
			// TODO Strangely, start sometimes fails with the message that the recognition has already started. It is most probably a bug in the implementation of the Web Speech API. We need to catch this particular error and not throw on this occasion
			this._psychoJS.logger.error('unable to start the speech to text transcription: ' + JSON.stringify(error));
			this._status = PsychoJS.Status.ERROR;

			throw {
				origin: 'Transcriber.start',
				context: 'when starting the speech to text transcription with transcriber: ' + this._name,
				error
			};
		}
	}

	/**
	 * Stop the speech recognition process.
	 *
	 * @return {Promise|undefined} promise fulfilled, with an object holding the stop time,
	 * 	when the process actually stops; undefined if the process was not started
	 */
	stop()
	{
		// nothing to do if the recognition is not running:
		if (this._status !== PsychoJS.Status.STARTED)
		{
			return;
		}

		this._psychoJS.logger.debug('request to stop the speech recognition process');

		this._recognition.stop();

		// return a promise, which will be satisfied when the process actually stops:
		// TODO the reject callback is stored but the recognition callbacks do not invoke it yet
		return new Promise((resolve, reject) =>
		{
			this._stopCallback = resolve;
			this._errorCallback = reject;
		});
	}

	/**
	 * Get the list of transcripts still in the buffer, i.e. those that have not been
	 * previously cleared by calls to getTranscripts with clear = true.
	 *
	 * @param {Object} options
	 * @param {string[]} [options.transcriptList= []] - the list of transcript texts to consider. If transcriptList is empty, we consider all transcripts.
	 * @param {boolean} [options.clear= true] - whether to remove the returned transcripts from the buffer, in which case they will not be returned by a subsequent call to getTranscripts. If a transcriptList has been given and clear = true, we only remove from the buffer the transcripts whose text is in transcriptList
	 * @return {Transcript[]} the list of transcripts still in the buffer, from the oldest to the most recent
	 */
	getTranscripts({
		transcriptList = [],
		clear = true
	} = {})
	{
		// if nothing in the buffer, return immediately:
		if (this._bufferLength === 0)
		{
			return [];
		}

		// iterate over the buffer, from start to end, and discard the null transcripts (i.e. those
		// previously cleared):
		// note: once the buffer has wrapped around, the oldest transcript is the one right after
		// the most recently inserted one; otherwise it is at index 0
		const filteredTranscripts = [];
		const bufferWrap = (this._bufferLength === this._bufferSize);
		let i = bufferWrap ? this._bufferIndex : -1;
		do
		{
			i = (i + 1) % this._bufferSize;

			const transcript = this._circularBuffer[i];
			if (transcript)
			{
				// if the transcriptList is empty or the transcript text is in the transcriptList:
				if (transcriptList.length === 0 || transcriptList.includes(transcript.text))
				{
					filteredTranscripts.push(transcript);

					if (clear)
					{
						this._circularBuffer[i] = null;
					}
				}
			}
		} while (i !== this._bufferIndex);

		return filteredTranscripts;
	}

	/**
	 * Clear all transcripts and reset the circular buffer.
	 */
	clearTranscripts()
	{
		// circular buffer of transcripts:
		this._circularBuffer = new Array(this._bufferSize);
		this._bufferLength = 0;
		this._bufferIndex = -1;
	}

	/**
	 * Callback for changes to the recognition settings.
	 *
	 * <p>Changes to the recognition settings require the speech recognition process
	 * to be stopped and be re-started. If the process is not running, the new settings
	 * are applied to a fresh recogniser, which is not started.</p>
	 *
	 * <p>Note: preparing a new recogniser also clears the transcript buffer.</p>
	 *
	 * @protected
	 */
	_onChange()
	{
		if (this._status !== PsychoJS.Status.STARTED)
		{
			// nothing is running: we only need to apply the new settings
			this._prepareRecognition();
			return;
		}

		// stopping is asynchronous: we have to wait until the process has actually stopped
		// before we replace the recogniser and start it again
		this.stop()
			.then(() =>
			{
				this._prepareRecognition();
				return this.start();
			})
			.catch((error) =>
			{
				this._psychoJS.logger.error('unable to restart the speech recognition after a change of settings: ' + JSON.stringify(error));
			});
	}

	/**
	 * Prepare the speech recognition process.
	 *
	 * <p>This resets the transcript buffer, creates a new recogniser configured with
	 * the current settings, and sets up its callbacks.</p>
	 *
	 * @protected
	 */
	_prepareRecognition()
	{
		// setup the circular buffer of transcripts:
		this.clearTranscripts();

		// recognition settings:
		const NativeSpeechRecognition = window.SpeechRecognition || window.webkitSpeechRecognition;
		this._recognition = new NativeSpeechRecognition();
		this._recognition.continuous = this._continuous;
		this._recognition.lang = this._lang;
		this._recognition.interimResults = this._interimResults;
		this._recognition.maxAlternatives = this._maxAlternatives;

		// grammar list with tokens added:
		if (Array.isArray(this._tokens) && this._tokens.length > 0)
		{
			const NativeSpeechGrammarList = window.SpeechGrammarList || window.webkitSpeechGrammarList;

			// note: we accept JSGF encoded strings, and relative weight indicator between 0.0 and 1.0
			// ref: https://www.w3.org/TR/jsgf/
			const name = "NULL";
			const grammar = `#JSGF V1.0; grammar ${name}; public <${name}> = ${this._tokens.join('|')};`;
			const grammarList = new NativeSpeechGrammarList();
			grammarList.addFromString(grammar, 1);
			this._recognition.grammars = grammarList;
		}

		// setup the callbacks:
		// note: they are arrow functions, i.e. "this" is the SpeechRecognition manager

		// called when the start of a speech is detected:
		this._recognition.onspeechstart = () =>
		{
			this._currentSpeechStart = this._clock.getTime();
			this._psychoJS.logger.debug('speech started');
		};

		// called when the end of a speech is detected:
		this._recognition.onspeechend = () =>
		{
			this._currentSpeechEnd = this._clock.getTime();
			this._psychoJS.logger.debug('speech ended');
		};

		// called when the recognition actually started:
		this._recognition.onstart = () =>
		{
			this._clock.reset();
			this._status = PsychoJS.Status.STARTED;
			this._psychoJS.logger.debug('speech recognition started');

			// resolve the SpeechRecognition.start promise, if need be:
			if (this._startCallback)
			{
				const resolveStart = this._startCallback;
				this._startCallback = undefined;
				resolveStart({
					time: this._psychoJS.monotonicClock.getTime()
				});
			}
		};

		// called whenever the recognition ends, e.g. after stop() or abort() are called:
		this._recognition.onend = () =>
		{
			this._status = PsychoJS.Status.STOPPED;
			this._psychoJS.logger.debug('speech recognition ended');

			// resolve the SpeechRecognition.stop promise, if need be:
			if (this._stopCallback)
			{
				const resolveStop = this._stopCallback;
				this._stopCallback = undefined;
				resolveStop({
					time: this._psychoJS.monotonicClock.getTime()
				});
			}
		};

		// called whenever a new result is available:
		this._recognition.onresult = (event) =>
		{
			this._recognitionTime = this._clock.getTime();

			// do not process the results if the Recogniser is not STARTED:
			if (this._status !== PsychoJS.Status.STARTED)
			{
				return;
			}

			// in continuous recognition mode, we need to get the result at resultIndex,
			// otherwise we pick the first result
			const resultIndex = (this._continuous) ? event.resultIndex : 0;

			// TODO at the moment we consider only the first alternative:
			const alternativeIndex = 0;

			const alternative = event.results[resultIndex][alternativeIndex];
			const text = alternative.transcript;
			const confidence = alternative.confidence;

			// create a new transcript:
			const transcript = new Transcript(this, text, confidence);

			// insert it in the circular transcript buffer:
			this._bufferIndex = (this._bufferIndex + 1) % this._bufferSize;
			this._bufferLength = Math.min(this._bufferLength + 1, this._bufferSize);
			this._circularBuffer[this._bufferIndex] = transcript;

			this._psychoJS.logger.debug('speech recognition transcript: ', JSON.stringify(transcript));
		};

		// called upon recognition errors:
		this._recognition.onerror = (event) =>
		{
			// lack of speech is not an error:
			if (event.error === 'no-speech')
			{
				return;
			}

			this._psychoJS.logger.error('speech recognition error: ', JSON.stringify(event));
			this._status = PsychoJS.Status.ERROR;
		};
	}
}